diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py new file mode 100644 index 0000000000..0bfb6caeaa --- /dev/null +++ b/configs/accl/async-pr.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + alpha, + threshold, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_async_pr_workload(alpha, threshold) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() 
limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/bc.py b/configs/accl/bc.py new file mode 100644 index 0000000000..56faeb3e4d --- /dev/null +++ b/configs/accl/bc.py @@ -0,0 +1,131 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_bc_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + iterations = 0 + while True: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iterations += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iterations}") + if verify: + system.print_answer() diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py new file mode 100644 index 0000000000..97f1b5dc21 --- /dev/null +++ b/configs/accl/bfs.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.visited, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + visited, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/cc.py b/configs/accl/cc.py new file mode 100644 index 0000000000..9b6d2b587d --- /dev/null +++ b/configs/accl/cc.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
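Every run script in this patch repeats the same --sample loop; the 100000000-tick slice corresponds to the 100us in the help text, given gem5's default 1 ps tick. A sketch of a shared helper that could hypothetically factor this out (it is not part of the patch):

```python
import m5

def simulate_with_sampling(interval=100_000_000):
    """Run the simulation in fixed-tick slices, dumping stats per slice."""
    while True:
        exit_event = m5.simulate(interval)
        print(
            f"Exited simulation at tick {m5.curTick()} "
            f"because {exit_event.getCause()}"
        )
        m5.stats.dump()
        m5.stats.reset()
        # Any cause other than the tick limit means the workload finished
        # (or another exit was scheduled), so stop sampling.
        if exit_event.getCause() != "simulate() limit reached":
            return exit_event
```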
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_cc_workload() + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py new file mode 100644 index 0000000000..569514eb82 --- /dev/null +++ b/configs/accl/pr.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights 
reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("iterations", type=int) + argparser.add_argument("alpha", type=float) + argparser.add_argument("--num_nodes", type=int, default=1) + argparser.add_argument("--error_threshold", type=float, default=0.0) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.iterations, + args.alpha, + args.num_nodes, + args.error_threshold, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + iterations, + alpha, + num_nodes, + error_threshold, + simple, + sample, + verify, + ) = get_inputs() + + print(f"error_threshold: {error_threshold}") + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_pr_workload(num_nodes, alpha) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit 
reached": + break + else: + iteration = 0 + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + print(f"error: {system.get_pr_error()}") + if system.get_pr_error() < error_threshold: + break + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") + if verify: + system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py new file mode 100644 index 0000000000..32d0dd26ab --- /dev/null +++ b/configs/accl/sega.py @@ -0,0 +1,217 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + dram_2=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + +class SEGA(System): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): + super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + # Building the CenteralController + self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts/2)): + mem = EdgeMemory("4GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the 
GPTs + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) + gpt.set_vertex_pch_bit(pch_bit) + gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.getPRError() + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py new file mode 100644 index 0000000000..2d36ec584d --- /dev/null +++ b/configs/accl/sega_simple.py @@ -0,0 +1,208 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
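interleave_addresses() in sega.py above carves one flat vertex range into 2 * num_gpts pseudo-channel ranges (the simple variant below uses num_gpts plain ranges); the interleaving bits sit just above the cache-line offset. A standalone check of that arithmetic, with example values:

```python
from math import log2

cache_line_size = 32  # bytes, so bits [4:0] address bytes within a line
num_channels = 8      # e.g. 2 * num_gpts for num_gpts = 4

intlv_low_bit = int(log2(cache_line_size))       # 5
intlv_bits = int(log2(num_channels))             # 3 bits pick one of 8 channels
intlv_high_bit = intlv_low_bit + intlv_bits - 1  # 7, passed as intlvHighBit

def channel_of(addr):
    # Bits [7:5] select the channel (ignoring the xorHighBit fold).
    return (addr >> intlv_low_bit) & (num_channels - 1)

assert channel_of(0x00) == 0
assert channel_of(0x20) == 1   # the next 32-byte atom maps to the next channel
assert channel_of(0x100) == 0  # wraps around after num_channels atoms
```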
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGA(System): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): + super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + # Building the CenteralController + self.ctrl = CenteralController( + vertex_image_file=f"{graph_path}/vertices" + ) + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("4GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) + 
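+            # Each EdgeMemory is shared by two GPTs: with num_gpts / 2 edge
+            # memories, GPT i and GPT i + num_gpts / 2 both map to memory
+            # i % (num_gpts / 2).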
gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sssp.py b/configs/accl/sssp.py new file mode 100644 index 0000000000..f2e60b856a --- /dev/null +++ b/configs/accl/sssp.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
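sssp.py below seeds a single source vertex: init_addr is the byte offset of that vertex's WorkListItem in the vertices image (vertex_id * 16, since the packed WorkListItem is 16 bytes) and init_value is its initial distance. A hypothetical invocation with placeholder paths:

```sh
# SSSP from vertex 4 (init_addr = 4 * 16 = 64) with initial distance 0.
build/NULL/gem5.opt configs/accl/sssp.py 8 32 1MiB /path/to/graph 64 0
```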
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_sssp_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md new file mode 100644 index 0000000000..ebfca7e794 --- /dev/null +++ b/src/accl/graph/TODO.md @@ -0,0 +1,8 @@ +# TODO Items + +* We might need to revisit the fact that we could insert something to a queue on + the same cycle that another event is consuming something from the queue. +* Move checking for wl.degree == 0 to coalesce engine. +* Fix the retry system between memory queue and coalesce engine +* Update inheritance: There is not enough reason for PushEngine and +CoalesceEngine to be of the same type (i.e. delete BaseMemEngine). diff --git a/src/accl/graph/base/BaseReduceEngine.py b/src/accl/graph/base/BaseReduceEngine.py new file mode 100644 index 0000000000..0585c36e48 --- /dev/null +++ b/src/accl/graph/base/BaseReduceEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseReduceEngine(ClockedObject): + abstract = True + type = 'BaseReduceEngine' + cxx_header = "accl/graph/base/base_reduce_engine.hh" + cxx_class = 'gem5::BaseReduceEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript new file mode 100644 index 0000000000..35111c34d2 --- /dev/null +++ b/src/accl/graph/base/SConscript @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import("*") + +SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) + +Source("base_reduce_engine.cc") +Source("graph_workload.cc") diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc new file mode 100644 index 0000000000..ade95800d2 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/base_reduce_engine.hh" + +namespace gem5 +{ + +BaseReduceEngine::BaseReduceEngine(const Params ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)) +{} + +BaseReduceEngine::~BaseReduceEngine() +{} + +} diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh new file mode 100644 index 0000000000..268bb60b76 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ + +#include "params/BaseReduceEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReduceEngine : public ClockedObject +{ + private: + System* system; + + protected: + + const RequestorID _requestorId; + + public: + PARAMS(BaseReduceEngine); + BaseReduceEngine(const Params ¶ms); + ~BaseReduceEngine(); + + RequestorID requestorId() { return _requestorId; } +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh new file mode 100644 index 0000000000..a391e0794d --- /dev/null +++ b/src/accl/graph/base/data_structs.hh @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__
+#define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__
+
+#include "base/cprintf.hh"
+#include "base/intmath.hh"
+
+// The bracketed header names below were lost in extraction; these are the
+// inferred includes for the code in this file (std::deque, fixed-width ints).
+#include <cassert>
+#include <cstdint>
+#include <deque>
+
+namespace gem5
+{
+
+struct __attribute__ ((packed)) WorkListItem
+{
+    uint32_t tempProp : 32;
+    uint32_t prop : 32;
+    uint32_t edgeIndex : 32;
+    uint32_t degree : 30;
+    bool activeNow: 1;
+    bool activeFuture: 1;
+
+    std::string to_string()
+    {
+        return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, "
+                        "degree: %u, activeNow: %s, activeFuture: %s}",
+                        tempProp, prop, edgeIndex, degree,
+                        activeNow ? "true" : "false",
+                        activeFuture ? "true" : "false");
+    }
+
+    WorkListItem():
+        tempProp(0),
+        prop(0),
+        edgeIndex(0),
+        degree(0),
+        activeNow(false),
+        activeFuture(false)
+    {}
+
+    WorkListItem(uint32_t temp_prop, uint32_t prop,
+                 uint32_t degree, uint32_t edge_index,
+                 bool active_now, bool active_future):
+        tempProp(temp_prop), prop(prop), edgeIndex(edge_index), degree(degree),
+        activeNow(active_now), activeFuture(active_future)
+    {}
+
+};
+
+struct __attribute__ ((packed)) Edge
+{
+    uint16_t weight : 16;
+    uint64_t neighbor : 48;
+
+    std::string to_string()
+    {
+        return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor);
+    }
+
+    Edge(): weight(0), neighbor(0) {}
+
+    Edge(uint16_t weight, uint64_t neighbor):
+        weight(weight),
+        neighbor(neighbor)
+    {}
+};
+
+static_assert(isPowerOf2(sizeof(WorkListItem)));
+static_assert(isPowerOf2(sizeof(Edge)));
+
+struct MetaEdge {
+    uint64_t src;
+    uint64_t dst;
+    uint32_t weight;
+    uint32_t value;
+
+    MetaEdge(): src(0), dst(0), weight(0), value(0)
+    {}
+    MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value):
+        src(src), dst(dst), weight(weight), value(value)
+    {}
+
+    std::string to_string()
+    {
+        return csprintf("MetaEdge{src: %lu, dst: %lu, weight: %u, value: %u}",
+                        src, dst, weight, value);
+    }
+};
+
+struct Update {
+    uint64_t src;
+    uint64_t dst;
+    uint32_t value;
+
+    Update(): src(0), dst(0), value(0)
+    {}
+    Update(uint64_t src, uint64_t dst, uint32_t value):
+        src(src), dst(dst), value(value)
+    {}
+
+    std::string to_string()
+    {
+        return csprintf("Update{src: %lu, dst: %lu, value: %u}",
+                        src, dst, value);
+    }
+};
+
+template <typename T>
+class UniqueFIFO
+{
+  private:
+    int cap;
+    int pop;
+
+    int* added;
+    int* deleted;
+    std::deque<T> container;
+
+  public:
+    UniqueFIFO() {
+        cap = 0;
+        pop = 0;
+        added = nullptr;
+        deleted = nullptr;
+    }
+
+    UniqueFIFO(int size) {
+        cap = size;
+        pop = 0;
+
+        added = (int*) new int [cap];
+        deleted = (int*) new int [cap];
+
+        for (int i = 0; i < cap; i++) {
+            added[i] = 0;
+            deleted[i] = 0;
+        }
+        container.clear();
+    }
+
+    void fix_front() {
+        while(true) {
+            T elem = container.front();
+            if (deleted[elem] > 0) {
+                deleted[elem]--;
+                added[elem]--;
+                container.pop_front();
+            } else {
+                assert(deleted[elem] == 0);
+                assert(added[elem] == 1);
+                break;
+            }
+        }
+    }
+
+    T front() {
+        fix_front();
+        return container.front();
+    }
+
+    size_t size() {
+        return pop;
+    }
+
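+    // NOTE: erase() only marks an element as deleted and drops the live
+    // count; the stale entry stays in `container` until fix_front() skips
+    // past it, which is why front() and pop_front() go through fix_front().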
+    void clear() {
+        pop = 0;
+        for (int i = 0; i < cap; i++) {
+            added[i] = 0;
+            deleted[i] = 0;
+        }
+        container.clear();
+    }
+
+    bool empty() {
+        return size() == 0;
+    }
+
+    bool find(T item) {
+        assert(added[item] >= 0);
+        assert(deleted[item] >= 0);
+        int diff = added[item] - deleted[item];
+        assert((diff == 0) || (diff == 1));
+        return (diff == 1);
+    }
+
+    void push_back(T item) {
+        if (!find(item)) {
+            added[item]++;
+            pop++;
+            container.push_back(item);
+        }
+    }
+
+    void pop_front() {
+        T elem = front();
+        assert(added[elem] == 1);
+        added[elem] = 0;
+        pop--;
+        container.pop_front();
+    }
+
+    void erase(T item) {
+        assert(find(item));
+        deleted[item]++;
+        pop--;
+    }
+
+    void operator=(const UniqueFIFO& rhs) {
+        pop = rhs.pop;
+        container = rhs.container;
+        added = rhs.added;
+        deleted = rhs.deleted;
+    }
+};
+
+}
+
+#endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__
diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc
new file mode 100644
index 0000000000..fd802cf275
--- /dev/null
+++ b/src/accl/graph/base/graph_workload.cc
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/base/graph_workload.hh"
+
+// The bracketed header name below was lost in extraction; <cstring> (for
+// std::memcpy) is the inferred original.
+#include <cstring>
+
+#include "base/cprintf.hh"
+#include "base/intmath.hh"
+
+namespace gem5
+{
+
+template <typename T>
+float
+writeToFloat(T value)
+{
+    assert(sizeof(T) == sizeof(float));
+    float float_form;
+    std::memcpy(&float_form, &value, sizeof(float));
+    return float_form;
+}
+
+// NOTE: readFromFloat's template argument cannot be deduced from its
+// parameter, so call sites pass it explicitly; the <typename T> and
+// <uint32_t> brackets were lost in extraction and have been restored.
+template <typename T>
+T
+readFromFloat(float value)
+{
+    assert(sizeof(T) == sizeof(float));
+    T float_bits;
+    std::memcpy(&float_bits, &value, sizeof(float));
+    return float_bits;
+}
+
+void
+BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir)
+{
+    size_t pkt_size = pkt->getSize();
+    uint64_t aligned_addr = roundDown(initAddr, pkt_size);
+
+    if (pkt->getAddr() == aligned_addr) {
+        int num_elements = (int) (pkt_size / sizeof(WorkListItem));
+        WorkListItem items[num_elements];
+
+        pkt->writeDataToBlock((uint8_t*) items, pkt_size);
+        int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem));
+        WorkListItem new_wl = items[index];
+        new_wl.tempProp = initValue;
+        if (activeCondition(new_wl, items[index])) {
+            new_wl.activeNow = true;
+            dir->activate(aligned_addr);
+        }
+        items[index] = new_wl;
+
+        pkt->deleteData();
+        pkt->allocate();
+        pkt->setDataFromBlock((uint8_t*) items, pkt_size);
+    }
+}
+
+uint32_t
+BFSWorkload::reduce(uint32_t update, uint32_t value)
+{
+    return std::min(update, value);
+}
+
+uint32_t
+BFSWorkload::propagate(uint32_t value, uint32_t weight)
+{
+    return value + 1;
+}
+
+bool
+BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl)
+{
+    return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree > 0);
+}
+
+uint32_t
+BFSWorkload::apply(WorkListItem& wl)
+{
+    wl.prop = wl.tempProp;
+    return wl.prop;
+}
+
+std::string
+BFSWorkload::printWorkListItem(const WorkListItem wl)
+{
+    return csprintf(
+        "WorkListItem{tempProp: %u, prop: %u, degree: %u, "
+        "edgeIndex: %u, activeNow: %s, activeFuture: %s}",
+        wl.tempProp, wl.prop, wl.degree, wl.edgeIndex,
+        wl.activeNow ? "true" : "false",
+        wl.activeFuture ? "true" : "false");
+}
+
+uint32_t
+BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) {
+    return value;
+}
+
+void
+CCWorkload::init(PacketPtr pkt, WorkDirectory* dir)
+{
+    Addr pkt_addr = pkt->getAddr();
+    size_t pkt_size = pkt->getSize();
+    int num_elements = (int) (pkt_size / sizeof(WorkListItem));
+    WorkListItem items[num_elements];
+
+    pkt->writeDataToBlock((uint8_t*) items, pkt_size);
+    bool atom_active = false;
+    for (int i = 0; i < num_elements; i++) {
+        WorkListItem new_wl = items[i];
+        new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i;
+        new_wl.activeNow = activeCondition(new_wl, items[i]);
+        atom_active |= new_wl.activeNow;
+        items[i] = new_wl;
+    }
+    if (atom_active) {
+        dir->activate(pkt->getAddr());
+    }
+    pkt->deleteData();
+    pkt->allocate();
+    pkt->setDataFromBlock((uint8_t*) items, pkt_size);
+}
+
+uint32_t
+SSSPWorkload::propagate(uint32_t value, uint32_t weight)
+{
+    return value + weight;
+}
+
+void
+PRWorkload::init(PacketPtr pkt, WorkDirectory* dir)
+{
+    int num_elements = pkt->getSize() / sizeof(WorkListItem);
+    WorkListItem items[num_elements];
+    pkt->writeDataToBlock((uint8_t*) items, pkt->getSize());
+
+    bool atom_active = false;
+    for (int index = 0; index < num_elements; index++) {
+        WorkListItem new_wl = items[index];
+        new_wl.tempProp = readFromFloat<uint32_t>(0);
+        new_wl.prop = readFromFloat<uint32_t>(1 - alpha);
+        new_wl.activeNow = activeCondition(new_wl, items[index]);
+        atom_active |= new_wl.activeNow;
+        items[index] = new_wl;
+    }
+    if (atom_active) {
+        dir->activate(pkt->getAddr());
+    }
+
+    pkt->deleteData();
+    pkt->allocate();
+    pkt->setDataFromBlock((uint8_t*) items, pkt->getSize());
+}
+
+uint32_t
+PRWorkload::reduce(uint32_t update, uint32_t value)
+{
+    float update_float = writeToFloat(update);
+    float value_float = writeToFloat(value);
+    return readFromFloat<uint32_t>(update_float + value_float);
+}
+
+uint32_t
+PRWorkload::propagate(uint32_t value, uint32_t weight)
+{
+    float value_float = writeToFloat(value);
+    float weight_float = writeToFloat(weight);
+    if (weight == 0) {
+        weight_float = 1.0;
+    }
+    return readFromFloat<uint32_t>(alpha * value_float * weight_float);
+}
+
+bool
+PRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl)
+{
+    float temp_float = writeToFloat(new_wl.tempProp);
+    float prop_float = writeToFloat(new_wl.prop);
+    float dist = std::abs(temp_float - prop_float);
+    return (dist >= threshold) && (new_wl.degree > 0);
+}
+
+uint32_t
+PRWorkload::apply(WorkListItem& wl)
+{
+    float temp_float = writeToFloat(wl.tempProp);
+    float prop_float = writeToFloat(wl.prop);
+    float delta = (temp_float - prop_float) / wl.degree;
+    wl.prop = wl.tempProp;
+    return readFromFloat<uint32_t>(delta);
+}
+
+std::string
+PRWorkload::printWorkListItem(const WorkListItem wl)
+{
+    float temp_float = writeToFloat(wl.tempProp);
+    float prop_float = writeToFloat(wl.prop);
+    return csprintf(
+        "WorkListItem{tempProp: %f, prop: %f, degree: %u, "
+        "edgeIndex: %u, activeNow: %s, activeFuture: %s}",
+        temp_float, prop_float, wl.degree, wl.edgeIndex,
+        wl.activeNow ? "true" : "false",
+        wl.activeFuture ? "true" : "false");
+}
+
+void
+BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir)
+{
+    size_t pkt_size = pkt->getSize();
+    int num_elements = (int) (pkt_size / sizeof(WorkListItem));
+    WorkListItem items[num_elements];
+
+    pkt->writeDataToBlock((uint8_t*) items, pkt_size);
+    bool atom_active = false;
+    for (int i = 0; i < num_elements; i++) {
+        WorkListItem new_wl = items[i];
+        new_wl.tempProp = readFromFloat<uint32_t>((1 - alpha) / numNodes);
+        new_wl.prop = readFromFloat<uint32_t>(1 / numNodes);
+        new_wl.activeNow = activeCondition(new_wl, items[i]);
+        atom_active |= new_wl.activeNow;
+        items[i] = new_wl;
+    }
+    if (atom_active) {
+        dir->activate(pkt->getAddr());
+    }
+    pkt->deleteData();
+    pkt->allocate();
+    pkt->setDataFromBlock((uint8_t*) items, pkt_size);
+}
+
+uint32_t
+BSPPRWorkload::reduce(uint32_t update, uint32_t value)
+{
+    float update_float = writeToFloat(update);
+    float value_float = writeToFloat(value);
+    return readFromFloat<uint32_t>(update_float + value_float);
+}
+
+uint32_t
+BSPPRWorkload::propagate(uint32_t value, uint32_t weight)
+{
+    float value_float = writeToFloat(value);
+    return readFromFloat<uint32_t>(alpha * value_float);
+}
+
+bool
+BSPPRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl)
+{
+    return (old_wl.degree > 0);
+}
+
+uint32_t
+BSPPRWorkload::apply(WorkListItem& wl)
+{
+    float prop_float = writeToFloat(wl.prop);
+    float delta = prop_float / wl.degree;
+    uint32_t delta_uint = readFromFloat<uint32_t>(delta);
+    return delta_uint;
+}
+
+void
+BSPPRWorkload::interIterationInit(WorkListItem& wl)
+{
+    float temp_float = writeToFloat(wl.tempProp);
+    float prop_float = writeToFloat(wl.prop);
+    error += std::abs(temp_float - prop_float);
+    wl.prop = wl.tempProp;
+    wl.tempProp = readFromFloat<uint32_t>((1 - alpha) / numNodes);
+    wl.activeFuture = (wl.degree > 0);
+}
+
+std::string
+BSPPRWorkload::printWorkListItem(const WorkListItem wl)
+{
+    float temp_float = writeToFloat(wl.tempProp);
+    float prop_float = writeToFloat(wl.prop);
+    return csprintf(
+        "WorkListItem{tempProp: %f, prop: %f, degree: %u, "
+        "edgeIndex: %u, activeNow: %s, activeFuture: %s}",
+        temp_float, prop_float, wl.degree, wl.edgeIndex,
+        wl.activeNow ? "true" : "false",
+        wl.activeFuture ? "true" : "false");
+}
+
+void
+BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir)
+{
+    int pkt_size = pkt->getSize();
+    int aligned_addr = roundDown(initAddr, pkt_size);
+
+    if (aligned_addr == pkt->getAddr()) {
+        int num_elements = pkt_size / sizeof(WorkListItem);
+        WorkListItem items[num_elements];
+        pkt->writeDataToBlock((uint8_t*) items, pkt_size);
+        int index = (initAddr - aligned_addr) / sizeof(WorkListItem);
+        WorkListItem new_wl = items[index];
+        uint32_t prop = 0;
+        prop |= initValue;
+        // NOTE: Depth of the initial vertex is 0.
+        prop &= countMask;
+        new_wl.tempProp = prop;
+        new_wl.prop = prop;
+        if (activeCondition(new_wl, items[index])) {
+            new_wl.activeNow = true;
+            dir->activate(aligned_addr);
+        }
+        items[index] = new_wl;
+
+        pkt->deleteData();
+        pkt->allocate();
+        pkt->setDataFromBlock((uint8_t*) items, pkt_size);
+    }
+}
+
+uint32_t
+BSPBCWorkload::reduce(uint32_t update, uint32_t value)
+{
+    uint32_t update_depth = (update & depthMask) >> 24;
+    uint32_t update_count = (update & countMask);
+    uint32_t value_depth = (value & depthMask) >> 24;
+    uint32_t value_count = (value & countMask);
+    if (value_depth == 255) {
+        value_depth = currentDepth;
+        value_count = 0;
+    }
+    if (value_depth == currentDepth) {
+        value_count += update_count;
+    }
+    uint32_t ret = 0;
+    ret |= value_count;
+    warn_if(value_count > 16777215, "value count has grown bigger than "
+            "16777215 (2^24 - 1). This means the algorithm result might not "
+            "be correct. However, the traversal will not be affected. "
+            "Therefore, performance metrics can still be used.");
+    // HACK: Make sure to always set the depth correctly even if count
+    // exceeds the 2^24-1 limit. Here we reset the depth section of ret.
+    ret &= countMask;
+    // NOTE: Now that the depth is securely reset we can copy the correct
+    // value.
+    ret |= (value_depth << 24);
+    return ret;
+}
+
+uint32_t
+BSPBCWorkload::propagate(uint32_t value, uint32_t weight)
+{
+    return value;
+}
+
+uint32_t
+BSPBCWorkload::apply(WorkListItem& wl)
+{
+    return wl.prop;
+}
+
+void
+BSPBCWorkload::interIterationInit(WorkListItem& wl)
+{
+    wl.prop = wl.tempProp;
+}
+
+bool
+BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl)
+{
+    uint32_t depth = (new_wl.tempProp & depthMask) >> 24;
+    return (depth == currentDepth) && (new_wl.degree > 0);
+}
+
+std::string
+BSPBCWorkload::printWorkListItem(WorkListItem wl)
+{
+    uint32_t temp_depth = (wl.tempProp & depthMask) >> 24;
+    uint32_t temp_count = (wl.tempProp & countMask);
+    uint32_t depth = (wl.prop & depthMask) >> 24;
+    uint32_t count = (wl.prop & countMask);
+    return csprintf(
+        "WorkListItem{tempProp: (depth: %d, count: %d), "
+        "prop: (depth: %d, count: %d), degree: %u, "
+        "edgeIndex: %u, activeNow: %s, activeFuture: %s}",
+        temp_depth, temp_count, depth, count, wl.degree, wl.edgeIndex,
+        wl.activeNow ? "true" : "false",
+        wl.activeFuture ? "true" : "false");
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh
new file mode 100644
index 0000000000..72748502c1
--- /dev/null
+++ b/src/accl/graph/base/graph_workload.hh
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__
+#define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__
+
+#include <cstdint>
+#include <string>
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/sega/work_directory.hh"
+#include "mem/packet.hh"
+
+
+namespace gem5
+{
+
+class GraphWorkload
+{
+  public:
+    GraphWorkload() {}
+    virtual ~GraphWorkload() {}
+
+    virtual void init(PacketPtr pkt, WorkDirectory* dir) = 0;
+    virtual uint32_t reduce(uint32_t update, uint32_t value) = 0;
+    virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0;
+    virtual uint32_t apply(WorkListItem& wl) = 0;
+    virtual void iterate() = 0;
+    virtual void interIterationInit(WorkListItem& wl) = 0;
+    virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0;
+    virtual std::string printWorkListItem(const WorkListItem wl) = 0;
+};
+
+class BFSWorkload : public GraphWorkload
+{
+  private:
+    uint64_t initAddr;
+    uint32_t initValue;
+
+  public:
+    BFSWorkload(uint64_t init_addr, uint32_t init_value):
+        initAddr(init_addr), initValue(init_value)
+    {}
+
+    ~BFSWorkload() {}
+
+    virtual void init(PacketPtr pkt, WorkDirectory* dir);
+    virtual uint32_t reduce(uint32_t update, uint32_t value);
+    virtual uint32_t propagate(uint32_t value, uint32_t weight);
+    virtual uint32_t apply(WorkListItem& wl);
+    virtual void iterate() {}
+    virtual void interIterationInit(WorkListItem& wl) {}
+    virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl);
+    virtual std::string printWorkListItem(const WorkListItem wl);
+};
+
+class BFSVisitedWorkload : public BFSWorkload
+{
+  public:
+    BFSVisitedWorkload(Addr init_addr, uint32_t init_value):
+        BFSWorkload(init_addr, init_value)
+    {}
+    virtual uint32_t propagate(uint32_t value, uint32_t weight) override;
+};
+
+class CCWorkload : public BFSVisitedWorkload
+{
+  public:
+    CCWorkload(): BFSVisitedWorkload(0, 0) {}
+    virtual void init(PacketPtr pkt, WorkDirectory* dir);
+};
+
+class SSSPWorkload : public BFSWorkload
+{
+  public:
+    SSSPWorkload(Addr init_addr, uint32_t init_value):
+        BFSWorkload(init_addr, init_value)
+    {}
+    virtual uint32_t propagate(uint32_t value, uint32_t weight) override;
+};
+
+class PRWorkload : public GraphWorkload
+{
+  private:
+    float alpha;
+    float threshold;
+
+  public:
+    PRWorkload(float alpha, float threshold):
+        alpha(alpha), threshold(threshold)
+    {}
+
+    ~PRWorkload() {}
+
+    virtual void init(PacketPtr pkt, WorkDirectory* dir);
+    virtual uint32_t reduce(uint32_t update, uint32_t value);
+    virtual uint32_t propagate(uint32_t value, uint32_t weight);
+    virtual uint32_t apply(WorkListItem& wl);
+    virtual void iterate() {}
+    virtual void interIterationInit(WorkListItem& wl) {}
+    virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl);
+    
virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class BSPPRWorkload : public GraphWorkload +{ + private: + int numNodes; + float alpha; + float prevError; + float error; + + public: + BSPPRWorkload(int num_nodes, float alpha): + numNodes(num_nodes), alpha(alpha), prevError(0), error(0) + {} + + ~BSPPRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { prevError = error; error = 0; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); + + float getError() { return prevError; } +}; + +class BSPBCWorkload : public GraphWorkload +{ + private: + Addr initAddr; + uint32_t initValue; + + int currentDepth; + + uint32_t depthMask; + uint32_t countMask; + public: + BSPBCWorkload(Addr init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value), + currentDepth(0), depthMask(4278190080), countMask(16777215) + {} + + ~BSPBCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { currentDepth++; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +} + +#endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ diff --git a/src/accl/graph/sega/BaseMemoryEngine.py b/src/accl/graph/sega/BaseMemoryEngine.py new file mode 100644 index 0000000000..10d8b708f0 --- /dev/null +++ b/src/accl/graph/sega/BaseMemoryEngine.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseMemoryEngine(ClockedObject): + abstract = True + type = 'BaseMemoryEngine' + cxx_header = "accl/graph/sega/base_memory_engine.hh" + cxx_class = 'gem5::BaseMemoryEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py new file mode 100644 index 0000000000..c5f44c82e9 --- /dev/null +++ b/src/accl/graph/sega/CenteralController.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
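+#
+# CenteralController is the host-facing SimObject of SEGA: it loads the
+# vertex image into the MPUs' memories, selects the processing mode
+# (asynchronous or bulk-synchronous), instantiates the chosen workload
+# (BFS, SSSP, CC, PR, BC), and exposes those hooks to the config scripts
+# through the PyBindMethod exports below.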
+ +from m5.params import * +from m5.proxy import * +from m5.util.pybind import PyBindMethod +from m5.objects.ClockedObject import ClockedObject + +class CenteralController(ClockedObject): + type = 'CenteralController' + cxx_header = "accl/graph/sega/centeral_controller.hh" + cxx_class = 'gem5::CenteralController' + + system = Param.System(Parent.any, "System this Engine is a part of") + + vertex_image_file = Param.String("Path to the vertex image file.") + + mpu_vector = VectorParam.MPU("All mpus in the system.") + + cxx_exports = [ + PyBindMethod("setAsyncMode"), + PyBindMethod("setBSPMode"), + PyBindMethod("createPopCountDirectory"), + PyBindMethod("createBFSWorkload"), + PyBindMethod("createBFSVisitedWorkload"), + PyBindMethod("createSSSPWorkload"), + PyBindMethod("createCCWorkload"), + PyBindMethod("createAsyncPRWorkload"), + PyBindMethod("createPRWorkload"), + PyBindMethod("createBCWorkload"), + PyBindMethod("workCount"), + PyBindMethod("getPRError"), + PyBindMethod("printAnswerToHostSimout") + ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py new file mode 100644 index 0000000000..25f8a1c58b --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CoalesceEngine(BaseMemoryEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + cache_size = Param.MemorySize("Size of the internal SRAM array.") + + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. Used to limit b/w.") + pending_pull_limit = Param.Int("Maximum number of pending pull processes.") + active_buffer_size = Param.Int("Maximum number of memory active memory " + "atoms ready to send updates. This parameter " + "and post_push_wb_queue_size should be set " + "in tandem. 
Probably, they should be equal.") + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..79fa7db8d0 --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +# from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class MPU(ClockedObject): + type = "MPU" + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = "gem5::MPU" + + system = Param.System(Parent.any, "System this MPU is a part of") + + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " + "MPU object.") + coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " + "each instance of MPU object.") + push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " + "instance of MPU object.") + diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py new file mode 100644 index 0000000000..63fa1eae62 --- /dev/null +++ b/src/accl/graph/sega/PushEngine.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class PushEngine(BaseMemoryEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + push_req_queue_size = Param.Int("Size of the queue to " + "queue push requests.") + # resp_queue_size should probably be + # significantly bigger than push_req_queue_size + resp_queue_size = Param.Int("Size of the response queue in the " + "push engine where it stores the " + "edges read from memory.") + + max_propagates_per_cycle = Param.Int("Maximum number of propagates " + "done per cycle.") + + update_queue_size = Param.Int("Maximum number of entries " + "for each update queue.") + + out_ports = VectorRequestPort("Outgoing ports to all MPUs") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript new file mode 100644 index 0000000000..b3e1a838fb --- /dev/null +++ b/src/accl/graph/sega/SConscript @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import("*") + +SimObject("BaseMemoryEngine.py", sim_objects=["BaseMemoryEngine"]) +SimObject("CenteralController.py", sim_objects=["CenteralController"]) +SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) +SimObject("MPU.py", sim_objects=["MPU"]) +SimObject("PushEngine.py", sim_objects=["PushEngine"]) +SimObject("WLEngine.py", sim_objects=["WLEngine"]) + +Source("base_memory_engine.cc") +Source("centeral_controller.cc") +Source("coalesce_engine.cc") +Source("enums.cc") +Source("mpu.cc") +Source("push_engine.cc") +Source("wl_engine.cc") + +DebugFlag("BaseMemoryEngine") +DebugFlag("CenteralController") +DebugFlag("CacheBlockState") +DebugFlag("CoalesceEngine") +DebugFlag("PushEngine") +DebugFlag("SEGAStructureSize") +DebugFlag("MSDebug") +DebugFlag("WLEngine") + +CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", + "WLEngine", "BaseMemoryEngine"]) diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py new file mode 100644 index 0000000000..5a8ed9c9fd --- /dev/null +++ b/src/accl/graph/sega/WLEngine.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
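+#
+# WLEngine is the reduce stage of an MPU: updates arriving over in_ports
+# are buffered in a queue of update_queue_size entries and reduced into
+# worklist items through a register file with register_file_size entries,
+# which bounds how many updates can be in service at the same time.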
+
+from m5.params import *
+from m5.proxy import *
+from m5.objects.BaseReduceEngine import BaseReduceEngine
+
+class WLEngine(BaseReduceEngine):
+    type = 'WLEngine'
+    cxx_header = "accl/graph/sega/wl_engine.hh"
+    cxx_class = 'gem5::WLEngine'
+
+    in_ports = VectorResponsePort("Incoming ports to receive updates "
+                                  "from remote PushEngines")
+
+    update_queue_size = Param.Int("Size of the queue where the WLEngine "
+                                  "stores the incoming updates")
+
+    register_file_size = Param.Int("Number of internal registers the "
+                                   "WLEngine has. It can service as many "
+                                   "updates at the same time as this "
+                                   "register file has entries.")
diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc
new file mode 100644
index 0000000000..9f704f71e9
--- /dev/null
+++ b/src/accl/graph/sega/base_memory_engine.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/base_memory_engine.hh"
+
+#include "debug/BaseMemoryEngine.hh"
+#include "debug/SEGAStructureSize.hh"
+
+namespace gem5
+{
+
+BaseMemoryEngine::BaseMemoryEngine(const BaseMemoryEngineParams &params):
+    ClockedObject(params),
+    system(params.system),
+    _requestorId(system->getRequestorId(this)),
+    memPort(name() + ".mem_port", this),
+    peerMemoryAtomSize(params.attached_memory_atom_size)
+{}
+
+BaseMemoryEngine::~BaseMemoryEngine()
+{}
+
+Port&
+BaseMemoryEngine::getPort(const std::string &if_name, PortID idx)
+{
+    if (if_name == "mem_port") {
+        return memPort;
+    } else {
+        return SimObject::getPort(if_name, idx);
+    }
+}
+
+void
+BaseMemoryEngine::init()
+{
+    AddrRangeList memory_ranges = memPort.getAddrRanges();
+
+    assert(memory_ranges.size() == 1);
+
+    peerMemoryRange = memory_ranges.front();
+
+    DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is "
+            "%s. The range is %sinterleaved.\n", __func__,
+            peerMemoryRange.to_string(),
+            peerMemoryRange.interleaved() ? "" : "not ");
+}
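+
+// MemPort holds at most one blocked packet: if sendTimingReq() is rejected,
+// the packet is parked in blockedPacket until recvReqRetry(); on a
+// successful send the owner is poked through recvMemRetry() so it can issue
+// its next memory function.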
"" : "not"); +} + +void +BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + DPRINTF(BaseMemoryEngine, "%s: Sending pakcet: %s to " + "the memory.\n", __func__, pkt->print()); + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked.\n", __func__); + } else { + DPRINTF(BaseMemoryEngine, "%s: Packet sent successfully.\n", __func__); + owner->recvMemRetry(); + } +} + +bool +BaseMemoryEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +BaseMemoryEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), + "Received retry without a blockedPacket"); + + _blocked = false; + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +PacketPtr +BaseMemoryEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + +PacketPtr +BaseMemoryEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +} diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh new file mode 100644 index 0000000000..afe7fd0433 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
+
+#include <functional>
+#include <string>
+
+#include "base/addr_range.hh"
+#include "mem/packet.hh"
+#include "mem/port.hh"
+#include "params/BaseMemoryEngine.hh"
+#include "sim/clocked_object.hh"
+#include "sim/system.hh"
+
+namespace gem5
+{
+
+class BaseMemoryEngine : public ClockedObject
+{
+  protected:
+    class MemoryEvent : public EventFunctionWrapper
+    {
+      private:
+        bool _pending;
+        int _prevState;
+
+      public:
+        MemoryEvent(const std::function<void(void)> &callback,
+                    const std::string &name):
+            EventFunctionWrapper(callback, name),
+            _pending(false), _prevState(0)
+        {}
+        bool pending() { return _pending; }
+        void sleep() { _pending = true; }
+        void wake() { _pending = false; }
+        void setPrevState(int state) { _prevState = state; }
+        int getPrevState() { return _prevState; }
+    };
+
+    class MemPort : public RequestPort
+    {
+      private:
+        BaseMemoryEngine* owner;
+        bool _blocked;
+        PacketPtr blockedPacket;
+
+      public:
+        MemPort(const std::string& name, BaseMemoryEngine* owner):
+            RequestPort(name, owner), owner(owner),
+            _blocked(false), blockedPacket(nullptr)
+        {}
+
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return _blocked; }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    System* system;
+    const RequestorID _requestorId;
+
+    MemPort memPort;
+    AddrRange peerMemoryRange;
+    size_t peerMemoryAtomSize;
+
+    virtual void recvMemRetry() = 0;
+    virtual bool handleMemResp(PacketPtr pkt) = 0;
+
+    PacketPtr createReadPacket(Addr addr, unsigned int size);
+    PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data);
+
+  public:
+    PARAMS(BaseMemoryEngine);
+
+    BaseMemoryEngine(const Params &params);
+    ~BaseMemoryEngine();
+
+    Port& getPort(const std::string &if_name,
+                PortID idx=InvalidPortID) override;
+
+    AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); }
+
+    virtual void recvFunctional(PacketPtr pkt) = 0;
+
+    virtual void init() override;
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
diff --git a/src/accl/graph/sega/busyMaskErr b/src/accl/graph/sega/busyMaskErr
new file mode 100644
index 0000000000..316fcd37d9
--- /dev/null
+++ b/src/accl/graph/sega/busyMaskErr
@@ -0,0 +1,16 @@
+gem5/build/NULL/gem5.opt -re --outdir=debug --debug-flags=CacheBlockState gem5/configs/accl/sega.py 1 1KiB /home/fariborz/SEGA/graphs/test/scale_21/binaries/mpu_1/ 0 0
+
+32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964145000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlock[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. +32964147000: system.gpts.coalesce_engine: processNextWriteBack: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. + +// This assertion would be hit although it should not. +// It is fixed by a hack in recvWLRead when hit in the cache. +assert(cacheBlocks[block_index].busyMask == 0); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc new file mode 100644 index 0000000000..fc4bacd414 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.cc @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "accl/graph/sega/centeral_controller.hh"
+
+#include <iostream>
+
+#include "base/cprintf.hh"
+#include "base/loader/memory_image.hh"
+#include "base/loader/object_file.hh"
+#include "debug/CenteralController.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+namespace gem5
+{
+
+CenteralController::CenteralController(const Params& params):
+    ClockedObject(params),
+    system(params.system),
+    mode(ProcessingMode::NOT_SET)
+{
+    for (auto mpu : params.mpu_vector) {
+        mpuVector.push_back(mpu);
+        mpu->registerCenteralController(this);
+    }
+}
+
+void
+CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new BFSWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createBFSVisitedWorkload(Addr init_addr,
+                                             uint32_t init_value)
+{
+    workload = new BFSVisitedWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createSSSPWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new SSSPWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createCCWorkload()
+{
+    workload = new CCWorkload();
+}
+
+void
+CenteralController::createAsyncPRWorkload(float alpha, float threshold)
+{
+    workload = new PRWorkload(alpha, threshold);
+}
+
+void
+CenteralController::createPRWorkload(int num_nodes, float alpha)
+{
+    workload = new BSPPRWorkload(num_nodes, alpha);
+}
+
+void
+CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new BSPBCWorkload(init_addr, init_value);
+}
+
+bool
+CenteralController::bufferRemoteUpdate(int slice_number, PacketPtr pkt)
+{
+    for (auto mpu: mpuVector) {
+        if (contains(mpu->getAddrRanges(), pkt->getAddr())) {
+            remoteUpdates[mpu][slice_number].push_back(pkt);
+        }
+    }
+
+    return true;
+}
+
+void
+CenteralController::createPopCountDirectory(int atoms_per_block)
+{
+    fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing "
+            "mode by calling either setAsyncMode or setBSPMode.");
+    if (mode == ProcessingMode::ASYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->createAsyncPopCountDirectory(atoms_per_block);
+        }
+    }
+    if (mode == ProcessingMode::BULK_SYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->createBSPPopCountDirectory(atoms_per_block);
+        }
+    }
+}
+
+void
+CenteralController::startup()
+{
+    unsigned int vertex_atom = mpuVector.front()->vertexAtomSize();
+    for (auto mpu: mpuVector) {
+        addrRangeListMap[mpu] = mpu->getAddrRanges();
+        mpu->setProcessingMode(mode);
+        mpu->recvWorkload(workload);
+    }
+
+    const auto& vertex_file = params().vertex_image_file;
+    if (vertex_file == "")
+        return;
+
+    auto* object = loader::createObjectFile(vertex_file, true);
+    fatal_if(!object, "%s: Could not load %s.", name(), vertex_file);
+
+    loader::debugSymbolTable.insert(*object->symtab().globals());
+    loader::MemoryImage vertex_image = object->buildImage();
+    maxVertexAddr = vertex_image.maxAddr();
+
+    PortProxy vertex_proxy(
+        [this](PacketPtr pkt) {
+            for (auto mpu: mpuVector) {
+                AddrRangeList range_list = addrRangeListMap[mpu];
+                if (contains(range_list, pkt->getAddr())) {
+                    mpu->recvFunctional(pkt);
+                }
+            }
+        }, vertex_atom);
+
+    panic_if(!vertex_image.write(vertex_proxy),
+            "%s: Unable to write image.", name());
+
+    for (auto mpu: mpuVector) {
+        mpu->postMemInitSetup();
+        if (!mpu->running() && (mpu->workCount() > 0)) {
+            mpu->start();
+        }
+    }
+    workload->iterate();
+}
+
+PacketPtr
+CenteralController::createReadPacket(Addr addr, unsigned int size)
+{
+    RequestPtr req = std::make_shared<Request>(addr, size, 0, 0);
+    // Dummy PC to have PC-based prefetchers latch on; get entropy into
+    // higher bits
+    req->setPC(((Addr) 0) << 2);
+
+    // Embed it in a packet
+    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+    pkt->allocate();
+
+    return pkt;
+}
+
+void
+CenteralController::recvDoneSignal()
+{
+    bool done = true;
+    for (auto mpu : mpuVector) {
+        done &= mpu->done();
+        int total_num_slices = remoteUpdates[mpu].size();
+        if (mpu->done()) {
+            int slice_number = mpu->getSliceCounter() + 1;
+            while ((total_num_slices != 0) &&
+                   (slice_number != mpu->getSliceCounter())) {
+                if (!remoteUpdates[mpu][slice_number].empty()) {
+                    mpu->scheduleNewSlice();
+                    mpu->updateSliceCounter(slice_number);
+                    done = false;
+                    break;
+                } else {
+                    // Wrap the slice index around past the last slice.
+                    if (slice_number == total_num_slices) {
+                        slice_number = 0;
+                    } else {
+                        slice_number++;
+                    }
+                }
+            }
+        }
+    }
+
+    if (done && mode == ProcessingMode::ASYNCHRONOUS) {
+        exitSimLoopNow("no update left to process.");
+    }
+
+    if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->postConsumeProcess();
+            mpu->swapDirectories();
+            if (!mpu->running() && (mpu->workCount() > 0)) {
+                mpu->start();
+            }
+        }
+        workload->iterate();
+        exitSimLoopNow("finished an iteration.");
+    }
+}
+
+int
+CenteralController::workCount()
+{
+    int work_count = 0;
+    for (auto mpu: mpuVector) {
+        work_count += mpu->workCount();
+    }
+    return work_count;
+}
+
+float
+CenteralController::getPRError()
+{
+    BSPPRWorkload* pr_workload = dynamic_cast<BSPPRWorkload*>(workload);
+    return pr_workload->getError();
+}
+
+void
+CenteralController::printAnswerToHostSimout()
+{
+    unsigned int vertex_atom = mpuVector.front()->vertexAtomSize();
+    int num_items = vertex_atom / sizeof(WorkListItem);
+    WorkListItem items[num_items];
+    for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom)
+    {
+        PacketPtr pkt = createReadPacket(addr, vertex_atom);
+        for (auto mpu: mpuVector) {
+            AddrRangeList range_list = addrRangeListMap[mpu];
+            if (contains(range_list, addr)) {
+                mpu->recvFunctional(pkt);
+            }
+        }
+        pkt->writeDataToBlock((uint8_t*) items, vertex_atom);
+        for (int i = 0; i < num_items; i++) {
+            std::string print = csprintf("WorkListItem[%lu][%d]: %s.",
+                                    addr, i,
+                                    workload->printWorkListItem(items[i]));
+            std::cout << print << std::endl;
+        }
+    }
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh
new file mode 100644
index 0000000000..6692d999ed
--- /dev/null
+++ b/src/accl/graph/sega/centeral_controller.hh
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
+#define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
+
+#include <deque>
+#include <unordered_map>
+#include <vector>
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/enums.hh"
+#include "accl/graph/sega/mpu.hh"
+#include "base/addr_range.hh"
+#include "params/CenteralController.hh"
+#include "sim/clocked_object.hh"
+#include "sim/system.hh"
+
+namespace gem5
+{
+
+class CenteralController : public ClockedObject
+{
+  private:
+    System* system;
+    Addr maxVertexAddr;
+
+    ProcessingMode mode;
+
+    std::vector<MPU*> mpuVector;
+    std::unordered_map<MPU*, AddrRangeList> addrRangeListMap;
+
+    PacketPtr createReadPacket(Addr addr, unsigned int size);
+
+  public:
+    GraphWorkload* workload;
+
+    PARAMS(CenteralController);
+    CenteralController(const CenteralControllerParams &params);
+    virtual void startup() override;
+
+    void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; }
+    void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; }
+
+    void createPopCountDirectory(int atoms_per_block);
+
+    void createBFSWorkload(Addr init_addr, uint32_t init_value);
+    void createBFSVisitedWorkload(Addr init_addr, uint32_t init_value);
+    void createSSSPWorkload(Addr init_addr, uint32_t init_value);
+    void createCCWorkload();
+    void createAsyncPRWorkload(float alpha, float threshold);
+    void createPRWorkload(int num_nodes, float alpha);
+    void createBCWorkload(Addr init_addr, uint32_t init_value);
+
+    bool bufferRemoteUpdate(int slice_number, PacketPtr pkt);
+    int getnumGPTs() { return mpuVector.size(); }
+
+    void recvDoneSignal();
+
+    int workCount();
+    float getPRError();
+    void printAnswerToHostSimout();
+
+    // Per-MPU buffers of remote updates, bucketed by destination slice.
+    std::unordered_map<MPU*, std::unordered_map<int, std::deque<PacketPtr>>>
+        remoteUpdates;
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc
new file mode 100644
index 0000000000..8c38341f48
--- /dev/null
+++ b/src/accl/graph/sega/coalesce_engine.cc
@@ -0,0 +1,1275 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/coalesce_engine.hh"
+
+#include <tuple>
+
+#include "accl/graph/sega/mpu.hh"
+#include "base/intmath.hh"
+#include "debug/CacheBlockState.hh"
+#include "debug/CoalesceEngine.hh"
+#include "debug/SEGAStructureSize.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+namespace gem5
+{
+
+CoalesceEngine::CoalesceEngine(const Params &params):
+    BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0),
+    numLines((int) (params.cache_size / peerMemoryAtomSize)),
+    numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))),
+    onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle),
+    pullsReceived(0), pullsScheduled(0),
+    pendingPullLimit(params.pending_pull_limit),
+    pendingPullReads(0), activeBufferSize(params.active_buffer_size),
+    postPushWBQueueSize(params.post_push_wb_queue_size),
+    nextMemoryEvent([this] {
+        processNextMemoryEvent();
+    }, name() + ".nextMemoryEvent"),
+    nextResponseEvent([this] {
+        processNextResponseEvent();
+    }, name() + ".nextResponseEvent"),
+    nextApplyEvent([this] {
+        processNextApplyEvent();
+    }, name() + ".nextApplyEvent"),
+    nextDoneSignalEvent([this] {
+        processNextDoneSignalEvent();
+    }, name() + ".nextDoneSignalEvent"),
+    stats(*this)
+{
+    assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine));
+    cacheBlocks = new Block [numLines];
+    for (int i = 0; i < numLines; i++) {
+        cacheBlocks[i] = Block(numElementsPerLine);
+    }
+    currentActiveCacheBlocks = UniqueFIFO<int>(numLines);
+    futureActiveCacheBlocks = UniqueFIFO<int>(numLines);
+
+    activeBuffer.clear();
+    postPushWBQueue.clear();
+}
+
+void
+CoalesceEngine::registerMPU(MPU* mpu)
+{
+    owner = mpu;
+}
+
+// NOTE: Used for initializing memory and reading the final answer
+void
+CoalesceEngine::recvFunctional(PacketPtr pkt)
+{
+    if (pkt->isRead()) {
+        assert(pkt->getSize() == peerMemoryAtomSize);
+        Addr addr = pkt->getAddr();
+        int block_index = getBlockIndex(addr);
+
+        if ((cacheBlocks[block_index].addr == addr) &&
+            (cacheBlocks[block_index].valid)) {
+            assert(cacheBlocks[block_index].state == CacheState::IDLE);
+
+            pkt->makeResponse();
+            pkt->setDataFromBlock(
+                (uint8_t*) cacheBlocks[block_index].items,
+                peerMemoryAtomSize);
+        } else {
+            memPort.sendFunctional(pkt);
+        }
+    } else {
+        graphWorkload->init(pkt, currentDirectory);
+        if (pkt->getAddr() > lastAtomAddr) {
+            lastAtomAddr = pkt->getAddr();
+        }
+        memPort.sendFunctional(pkt);
+    }
+}
+
+void
+CoalesceEngine::postMemInitSetup()
+{
+    currentDirectory->setLastAtomAddr(lastAtomAddr);
+}
+
+void
+CoalesceEngine::postConsumeProcess()
+{
+    Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr);
+    for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) {
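+        // local_addr walks the de-interleaved (local) view of this
+        // engine's address range; addIntlvBits() converts it back to the
+        // global address of the vertex atom.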
+ Addr addr = peerMemoryRange.addIntlvBits(local_addr); + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureActiveCacheBlocks.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { + futureActiveCacheBlocks.erase(block_index); + } + } else { + WorkListItem items[numElementsPerLine]; + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete read_pkt; + delete write_pkt; + } + } +} + +void +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void +CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(currentActiveCacheBlocks.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + currentActiveCacheBlocks.clear(); + currentActiveCacheBlocks = futureActiveCacheBlocks; + futureActiveCacheBlocks.clear(); +} + +bool +CoalesceEngine::done() +{ + return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); +} + +bool +CoalesceEngine::enoughSpace() +{ + return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; +} + +bool +CoalesceEngine::pullCondition() +{ + bool enough_space = 
enoughSpace(); + bool schedule_limit = pullsScheduled < pendingPullLimit; + return enough_space && schedule_limit; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +ReadReturnStatus +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].valid)) { + // Hit + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(cacheBlocks[block_index].state != CacheState::INVALID); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].state = CacheState::BUSY; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. 
For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].dirty); + + assert(MSHR.find(block_index) != MSHR.end()); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else { + // miss + assert(cacheBlocks[block_index].addr != aligned_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); + stats.readMisses++; + if (cacheBlocks[block_index].state != CacheState::INVALID) { + // conflict miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " + "Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + cacheBlocks[block_index].hasConflict = true; + if (cacheBlocks[block_index].state == CacheState::IDLE) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + // NOTE: The cache block could still be active even though + // it is not dirty. If it is active, we only have to move + // the activity tracking but can throw the data away. + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + // NOTE: Bring the cache line to the invalid state. + // NOTE: The line above where we set hasConflict to true + // does not matter anymore, since we reset the cache line.
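+                    // In other words, a clean conflict eviction only + // relocates the activity bookkeeping: the cache-side FIFOs drop this + // block index and the matching directory re-activates the atom's + // address, so no data needs to be written back.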
+ cacheBlocks[block_index].reset(); + } + return ReadReturnStatus::REJECT_NO_ROLL; + } else { + stats.numConflicts++; + return ReadReturnStatus::REJECT_ROLL; + } + } else { + // cold miss + assert(MSHR.find(block_index) == MSHR.end()); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + return ReadReturnStatus::ACCEPT; + } + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + + onTheFlyReqs--; + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState<ReadPurpose>(); + + // NOTE: Regardless of where the pkt will go, we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. + // NOTE: Also, we have to stop tracking the address in pendingPullAddrs. + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); + } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, the line should be in PENDING_DATA state. + // Regardless of the purpose for which it was read, it should + // be placed in the cache array. + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR. + assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but space was allocated + // for it in the cache later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // We move that back in time because it would be fine if + // processNextRead happened before the pkt arrived. processNextRead + // actually will check if there is a pending read for push for + // the address it's trying to populate. + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } + + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore.
+ // Since it is going to the cache, cache will be responsible for + // tracking this. Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + currentActiveCacheBlocks.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + futureActiveCacheBlocks.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. 
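+            // Note that pendingPullReads was already decremented for this + // packet above, which is why the assertion below can use a strict + // inequality: the slot that was reserved for this packet is now the + // free entry it will occupy.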
+ assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + activeBuffer.emplace_back(pkt, curTick()); + } else { + stats.wastefulBytesRead += pkt->getSize(); + delete pkt; + } + + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; + } + } + delete purpose; + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle. + if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. 
This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + + // NOTE: Design does not allow for write misses. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].busyMask != 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); + + // respective bit in busyMask for wl is set. + assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].dirty |= true; + } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].items[wl_offset] = wl; + if (mode == ProcessingMode::ASYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeNow |= active; + if (active && (!currentActiveCacheBlocks.find(block_index))) { + currentActiveCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; + if (active && (!futureActiveCacheBlocks.find(block_index))) { + futureActiveCacheBlocks.push_back(block_index); + } + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (cacheBlocks[block_index].busyMask == 0) { + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; 
+ cacheBlocks[block_index].lastChangedTick = curTick(); + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + if (memPort.blocked()) { + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function<void(int, Tick)> next_memory_function; + int next_memory_function_input; + Tick next_memory_function_tick; + std::tie( + next_memory_function, + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); + memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) + * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // TODO: Figure out if this is still necessary. + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // then it is definitely currently not active.
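+            // (A post-push write back is only created after every activeNow + // bit in the atom has been applied and cleared, so only the + // activeFuture bits checked below can still be set.)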
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + } else { + wb++; + } + } + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + ab = activeBuffer.erase(ab); + delete ab_pkt; + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + pullsScheduled++; + } + } else { + ab++; + } + } + if (!need_send_pkt) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } + + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. 
+ bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + if (enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + cacheBlocks[block_index].reset(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu in " + "favor of the write back scheduled later.\n", + __func__, block_index, schedule_tick); + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + if (!postPushWBQueue.empty()) { + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } + } +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: processNextVertexPull called.\n", __func__); + pullsScheduled--; + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); + } + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); + } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + pendingPullAddrs.insert(addr); + } + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: No pending MemRetry.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +int +CoalesceEngine::workCount() +{ + return currentActiveCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); +} + +void +CoalesceEngine::recvVertexPull() +{ + pullsReceived++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. 
pullsReceived: %d.\n", __func__, pullsReceived); + + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; + if (!nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextApplyEvent() +{ + if ((!activeBuffer.empty()) && + (postPushWBQueue.size() < postPushWBQueueSize)) { + PacketPtr pkt; + Tick entrance_tick; + WorkListItem items[numElementsPerLine]; + + std::tie(pkt, entrance_tick) = activeBuffer.front(); + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (items[index].activeNow) { + Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(items[index]); + items[index].activeNow = false; + owner->recvVertexPush(addr, delta, items[index].edgeIndex, + items[index].degree); + pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + } + // NOTE: If the atom is not active anymore, retire it from the + // activeBuffer and queue a write back for it. + if (!atom_active_now) { + PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), + peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + activeBuffer.pop_front(); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + delete pkt; + } + } else if (!currentActiveCacheBlocks.empty()) { + int num_visited_indices = 0; + int initial_fifo_length = currentActiveCacheBlocks.size(); + while (true) { + int block_index = currentActiveCacheBlocks.front(); + if (cacheBlocks[block_index].state == CacheState::IDLE) { + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (cacheBlocks[block_index].items[index].activeNow) { + Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].items[index].activeNow = false; + cacheBlocks[block_index].dirty = true; + owner->recvVertexPush(addr, delta, + cacheBlocks[block_index].items[index].edgeIndex, + cacheBlocks[block_index].items[index].degree); + pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + } + // NOTE: If no item in this cache block is active anymore, + // stop tracking it in the active FIFO. + if (!atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + } + break; + } + // NOTE: If the block with the index at the front of + // currentActiveCacheBlocks is not in IDLE state, then roll + // that index to the back. + currentActiveCacheBlocks.pop_front(); + currentActiveCacheBlocks.push_back(block_index); + // NOTE: If we have visited all the indices initially in the FIFO
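+            // (i.e. the FIFO has been rotated through once without finding + // an IDLE block), give up for this cycle; the apply event is + // rescheduled below as long as pullsReceived is still positive.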
+ num_visited_indices++; + if (num_visited_indices == initial_fifo_length) { + break; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__); + stats.worklessCycles++; + } + + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; + } + + if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} + +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + lastResetTick(0), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertices read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertices written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), + ADD_STAT(numConflicts, statistics::units::Count::get(), + "Number of conflicts raised by reads in the cache."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage."), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by the coalesce engine."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by the PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine."), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "Cycles in which the coalesce engine could not find work to apply."), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory."), + ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + currentFrontierSize.init(64); + futureFrontierSize.init(64); + currentBlockActiveCount.init(64); + futureBlockActiveCount.init(64); + responseQueueLatency.init(64); + memoryFunctionLatency.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh new file mode 100644 index 0000000000..10a71a7ef1 --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include <deque> +#include <functional> +#include <string> +#include <tuple> +#include <unordered_map> +#include <unordered_set> +#include <vector> + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/work_directory.hh" +#include "base/cprintf.hh" +#include "base/statistics.hh" +#include "params/CoalesceEngine.hh" + +namespace gem5 +{ + +class MPU; + +class CoalesceEngine : public BaseMemoryEngine +{ + private: + struct Block + { + WorkListItem* items; + Addr addr; + uint64_t busyMask; + bool valid; + bool dirty; + bool hasConflict; + CacheState state; + Tick lastChangedTick; + Block() {} + Block(int num_elements): + addr(-1), + busyMask(0), + valid(false), + dirty(false), + hasConflict(false), + state(CacheState::INVALID), + lastChangedTick(0) + { + items = new WorkListItem [num_elements]; + } + + void reset() { + addr = -1; + busyMask = 0; + valid = false; + dirty = false; + hasConflict = false; + state = CacheState::INVALID; + lastChangedTick = 0; + } + + std::string to_string() { + return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "dirty: %s, hasConflict: %s, state: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + dirty ? "true" : "false", hasConflict ? "true" : "false", + cacheStateStrings[state], lastChangedTick); + } + }; + + struct ReadPurpose : public Packet::SenderState + { + ReadDestination _dest; + ReadPurpose(ReadDestination dest): _dest(dest) {} + ReadDestination dest() { return _dest; } + }; + + MPU* owner; + ProcessingMode mode; + WorkDirectory* currentDirectory; + WorkDirectory* futureDirectory; + GraphWorkload* graphWorkload; + + Addr lastAtomAddr; + + int numLines; + int numElementsPerLine; + Block* cacheBlocks; + + int onTheFlyReqs; + std::unordered_map<int, std::vector<Addr>> MSHR; + + // Response route to WLEngine + int maxRespPerCycle; + std::deque<std::tuple<Addr, WorkListItem, Tick>> responseQueue; + + // Tracking work in cache + int pullsReceived; + // NOTE: Remember to erase from these upon eviction from cache + UniqueFIFO<int> currentActiveCacheBlocks; + UniqueFIFO<int> futureActiveCacheBlocks; + + int pullsScheduled; + int pendingPullLimit; + int pendingPullReads; + // Addresses with an in-flight read for push; used to avoid + // issuing duplicate pull reads for the same atom.
+ std::unordered_set<Addr> pendingPullAddrs; + + int activeBufferSize; + int postPushWBQueueSize; + std::deque<std::tuple<PacketPtr, Tick>> activeBuffer; + std::deque<std::tuple<PacketPtr, Tick>> postPushWBQueue; + + bool enoughSpace(); + bool pullCondition(); + int getBlockIndex(Addr addr); + + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); + std::deque<std::tuple<std::function<void(int, Tick)>, int, Tick>> memoryFunctionQueue; + + EventFunctionWrapper nextResponseEvent; + void processNextResponseEvent(); + + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine &coalesce); + + virtual void regStats() override; + + virtual void resetStats() override; + + CoalesceEngine &coalesce; + + Tick lastResetTick; + + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar numConflicts; + statistics::Scalar responsePortShortage; + statistics::Scalar numMemoryBlocks; + statistics::Scalar wastefulBytesRead; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + statistics::Scalar worklessCycles; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram currentFrontierSize; + statistics::Histogram futureFrontierSize; + statistics::Histogram currentBlockActiveCount; + statistics::Histogram futureBlockActiveCount; + statistics::Histogram responseQueueLatency; + statistics::Histogram memoryFunctionLatency; + }; + + CoalesceStats stats; + + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; + + public: + PARAMS(CoalesceEngine); + CoalesceEngine(const Params &params); + void registerMPU(MPU* mpu); + + void setProcessingMode(ProcessingMode _mode) { mode = _mode; } + void createAsyncPopCountDirectory(int atoms_per_block); + void createBSPPopCountDirectory(int atoms_per_block); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + + virtual void recvFunctional(PacketPtr pkt); + void postMemInitSetup(); + void postConsumeProcess(); + void swapDirectories(); + + ReadReturnStatus recvWLRead(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + int getSliceSize() + { return (int) (params().cache_size); } + // /sizeof(WorkListItem)); } + + int workCount(); + int futureWorkCount(); + void recvVertexPull(); + + bool done(); +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/coalesce_engine_s.cc b/src/accl/graph/sega/coalesce_engine_s.cc new file mode 100644 index 0000000000..6a5261d38c --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_s.cc @@ -0,0 +1,1223 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/coalesce_engine.hh" + +#include + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), + onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), + pendingPullLimit(params.pending_pull_limit), + pendingPullReads(0), activeBufferSize(params.active_buffer_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), + nextDoneSignalEvent([this] { + processNextDoneSignalEvent(); + }, name() + ".nextDoneSignalEvent"), + stats(*this) +{ + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); + } + activeBuffer.clear(); + postPushWBQueue.clear(); +} + +void +CoalesceEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + + +// NOTE: Used for initializing memory and reading the final answer +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].state == CacheState::IDLE); + + pkt->makeResponse(); + pkt->setDataFromBlock( + 
(uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + graphWorkload->init(pkt, currentDirectory); + if (pkt->getAddr() > lastAtomAddr) { + lastAtomAddr = pkt->getAddr(); + } + memPort.sendFunctional(pkt); + } +} + +void +CoalesceEngine::postMemInitSetup() +{ + currentDirectory->setLastAtomAddr(lastAtomAddr); +} + +void +CoalesceEngine::postConsumeProcess() +{ + Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); + for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { + Addr addr = peerMemoryRange.addIntlvBits(local_addr); + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureActiveCacheBlocks.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { + futureActiveCacheBlocks.erase(block_index); + } + } else { + WorkListItem items[numElementsPerLine]; + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete read_pkt; + delete write_pkt; + } + } +} + +void +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void +CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(currentActiveCacheBlocks.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = 
currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + currentActiveCacheBlocks.clear(); + currentActiveCacheBlocks = futureActiveCacheBlocks; + futureActiveCacheBlocks.clear(); +} + +bool +CoalesceEngine::done() +{ + return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); +} + +bool +CoalesceEngine::enoughSpace() +{ + return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; +} + +bool +CoalesceEngine::pullCondition() +{ + bool enough_space = enoughSpace(); + bool schedule_limit = pullsScheduled < pendingPullLimit; + return enough_space && schedule_limit; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + + + +ReadReturnStatus +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); + //assert(addr in a right slice) + // assert((cacheBlocks[block_index].addr == aligned_addr)) + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) and + (cacheBlocks[block_index].valid)) { + // Hit + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(cacheBlocks[block_index].state != CacheState::INVALID); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].state = CacheState::BUSY; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. 
For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else if (cacheBlocks[block_index].state == CacheState::PENDING_DATA) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].dirty); + + assert(MSHR.find(block_index) != MSHR.end()); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else { + // cold miss + assert(cacheBlocks[block_index].addr != aligned_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a cold miss.\n", + __func__, addr); + stats.readMisses++; + assert(MSHR.find(block_index) == MSHR.end()); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + return ReadReturnStatus::ACCEPT; + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + + onTheFlyReqs--; + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState<ReadPurpose>(); + + // NOTE: Regardless of where the pkt will go we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. + // NOTE: Also we have to stop tracking the address for pullAddrs + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); + } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, line should be in PENDING_DATA state. + // Regardless of the purpose for which it was read, it should + // be placed in the cache array. + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR.
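+            // PENDING_DATA lines are only created by the miss path of + // recvWLRead, which always records the missing address in the MSHR + // before scheduling processNextRead; hence the assertion below.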
+ assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but a cache line was + // allocated for it later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // Moving it back in time is safe because it would be fine if + // processNextRead happened before the pkt arrived: processNextRead + // checks whether there is a pending read for push for + // the address it's trying to populate. + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } + + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore. + // Since it is going to the cache, the cache will be responsible for + // tracking this. Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + currentActiveCacheBlocks.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + futureActiveCacheBlocks.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n", __func__, miss_addr, block_index); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue.
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. + assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + activeBuffer.emplace_back(pkt, curTick()); + } else { + stats.wastefulBytesRead += pkt->getSize(); + delete pkt; + } + + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; + } + } + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while (true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle.
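+                // (Note that the loop header already breaks while the head + // entry is younger than one cycle, so responsePortShortage below + // may also count cycles where the head entry is simply too young + // rather than blocked by the port-width limit.)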
+ if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + + // NOTE: Design does not allow for write misses. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].busyMask != 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); + + // respective bit in busyMask for wl is set. + assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].dirty |= true; + } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].items[wl_offset] = wl; + if (mode == ProcessingMode::ASYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeNow |= active; + if (active && (!currentActiveCacheBlocks.find(block_index))) { + currentActiveCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; + if (active && (!futureActiveCacheBlocks.find(block_index))) { + futureActiveCacheBlocks.push_back(block_index); + } + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (cacheBlocks[block_index].busyMask == 0) { + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + 
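+                    // The block is dropped without a write back since it is + // clean; hand its current activity back from the cache-side + // FIFO to the in-memory directory so it is not lost.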
currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; + cacheBlocks[block_index].lastChangedTick = curTick(); + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + if (memPort.blocked()) { + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function<void(int, Tick)> next_memory_function; + int next_memory_function_input; + Tick next_memory_function_tick; + std::tie( + next_memory_function, + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); + memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) + * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if (!memoryFunctionQueue.empty()) { + schedule(nextMemoryEvent, nextCycle()); + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // TODO: Figure out if this is still necessary. + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // then it is definitely not currently active.
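+            // (Atoms only enter the postPushWBQueue from + // processNextApplyEvent after all of their activeNow bits have been + // consumed, which is why only future activity is checked below.)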
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + } else { + wb++; + } + } + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + ab = activeBuffer.erase(ab); + delete ab_pkt; + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + pullsScheduled++; + } + } else { + ab++; + } + } + if (!need_send_pkt) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } + + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. 
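+            // (When the atom is currently active and the activeBuffer has + // room, the memory write is skipped entirely: the data is parked in + // the activeBuffer so it can be pushed from on chip and written back + // afterwards.)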
+ bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + if (enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + cacheBlocks[block_index].reset(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + if (!postPushWBQueue.empty()) { + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } + } +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + pullsScheduled--; + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); + } + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); + } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + pendingPullAddrs.insert(addr); + } + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +int +CoalesceEngine::workCount() +{ + return currentActiveCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); +} + +void +CoalesceEngine::recvVertexPull() +{ + pullsReceived++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. 
pullsReceived: %d.\n", __func__, pullsReceived); + + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; + if (!nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextApplyEvent() +{ + if ((!activeBuffer.empty()) && + (postPushWBQueue.size() < postPushWBQueueSize)) { + PacketPtr pkt; + Tick entrance_tick; + WorkListItem items[numElementsPerLine]; + + std::tie(pkt, entrance_tick) = activeBuffer.front(); + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (items[index].activeNow) { + Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(items[index]); + items[index].activeNow = false; + owner->recvVertexPush(addr, delta, items[index].edgeIndex, + items[index].degree); + pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + } + // NOTE: If the atom is not active anymore, write it back. + if (!atom_active_now) { + PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), + peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + activeBuffer.pop_front(); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + delete pkt; + } + } else if (!currentActiveCacheBlocks.empty()) { + int num_visited_indices = 0; + int initial_fifo_length = currentActiveCacheBlocks.size(); + while (true) { + int block_index = currentActiveCacheBlocks.front(); + if (cacheBlocks[block_index].state == CacheState::IDLE) { + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (cacheBlocks[block_index].items[index].activeNow) { + Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].items[index].activeNow = false; + cacheBlocks[block_index].dirty = true; + owner->recvVertexPush(addr, delta, + cacheBlocks[block_index].items[index].edgeIndex, + cacheBlocks[block_index].items[index].degree); + pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + } + // NOTE: If no item in this block is active anymore, + // stop tracking it. + if (!atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + } + break; + } + // NOTE: If the block at the front of activeCacheBlocks is not + // in IDLE state, roll that index to the back. + currentActiveCacheBlocks.pop_front(); + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Stop once we have visited all the items initially + // in the FIFO.
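+                // (Bounding the scan by the FIFO's initial length guarantees + // this loop terminates even when every resident active block is + // currently BUSY.)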
+ num_visited_indices++; + if (num_visited_indices == initial_fifo_length) { + break; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__); + stats.worklessCycles++; + } + + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; + } + + if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} + +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + lastResetTick(0), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertices read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertices written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), + ADD_STAT(numConflicts, statistics::units::Count::get(), + "Number of conflicts raised by reads in the cache."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage."), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by the coalesce engine."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine."), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "Cycles the coalesce engine could not find work to apply."), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory."), + ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine.
(ns)"), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + currentFrontierSize.init(64); + futureFrontierSize.init(64); + currentBlockActiveCount.init(64); + futureBlockActiveCount.init(64); + responseQueueLatency.init(64); + memoryFunctionLatency.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc new file mode 100644 index 0000000000..f7ef96197f --- /dev/null +++ b/src/accl/graph/sega/enums.cc @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/enums.hh" + +namespace gem5 +{ + +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_WB" +}; + +const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = +{ + "ACCEPT", + "REJECT_ROLL", + "REJECT_NO_ROLL" +}; + +const char* readDestinationStrings[NUM_READ_DESTINATION] = +{ + "READ_FOR_CACHE", + "READ_FOR_PUSH" +}; + +const char* processingModeStrings[NUM_PROCESSING_MODE] = +{ + "NOT_SET", + "ASYNCHRONOUS", + "BULK_SYNCHRONOUS" +}; + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh new file mode 100644 index 0000000000..f97c33a0e0 --- /dev/null +++ b/src/accl/graph/sega/enums.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_ENUMS_HH__ +#define __ACCL_GRAPH_SEGA_ENUMS_HH__ + +namespace gem5 +{ + +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_WB, + NUM_CACHE_STATE +}; +extern const char* cacheStateStrings[NUM_CACHE_STATE]; + +enum ReadReturnStatus +{ + ACCEPT, + REJECT_ROLL, + REJECT_NO_ROLL, + NUM_READ_RETURN_STATUS +}; +extern const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS]; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH, + NUM_READ_DESTINATION +}; +extern const char* readDestinationStrings[NUM_READ_DESTINATION]; + +enum ProcessingMode +{ + NOT_SET, + ASYNCHRONOUS, + BULK_SYNCHRONOUS, + NUM_PROCESSING_MODE +}; +extern const char* processingModeStrings[NUM_PROCESSING_MODE]; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..318ea0798b --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/mpu.hh" + +#include + +#include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +MPU::MPU(const Params& params): + ClockedObject(params), + system(params.system), + wlEngine(params.wl_engine), + coalesceEngine(params.coalesce_engine), + pushEngine(params.push_engine), + sliceCounter(0), + nextSliceEvent([this] { processNextSliceEvent(); }, name()) +{ + wlEngine->registerMPU(this); + coalesceEngine->registerMPU(this); + pushEngine->registerMPU(this); +} + +void +MPU::registerCenteralController(CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + +int +MPU::getSliceSize() +{ + int slice_number = + (coalesceEngine->getSliceSize() * centeralController->getnumGPTs()); + + return slice_number; +} + +bool +MPU::bufferRemoteUpdate(int slice_number, PacketPtr pkt) +{ + return centeralController->bufferRemoteUpdate(slice_number, pkt); +} + +bool +MPU::handleIncomingUpdate(PacketPtr pkt) +{ + return wlEngine->handleIncomingUpdate(pkt); +} + +void +MPU::scheduleNewSlice() +{ + if (!nextSliceEvent.scheduled()) { + schedule(nextSliceEvent, nextCycle()); + } + return; +} + +void +MPU::processNextSliceEvent() +{ + auto new_update = + centeralController->remoteUpdates[this][this->getSliceCounter()].front(); + bool sent = wlEngine->handleIncomingUpdate(new_update); + + centeralController->remoteUpdates[this] + [this->getSliceCounter()].pop_front(); + if (!sent) { + centeralController->remoteUpdates[this] + [this->getSliceCounter()].push_back(new_update); + } + + if (!centeralController->remoteUpdates[this][this->getSliceCounter()].empty() && !nextSliceEvent.scheduled()) { + schedule(nextSliceEvent, nextCycle()); + } + +} + +void +MPU::handleIncomingWL(Addr addr, WorkListItem wl) +{ + wlEngine->handleIncomingWL(addr, wl); +} + +void +MPU::recvWLWrite(Addr addr, WorkListItem wl) +{ + coalesceEngine->recvWLWrite(addr, wl); +} + +void +MPU::recvWorkload(GraphWorkload* workload) +{ + coalesceEngine->recvWorkload(workload); + pushEngine->recvWorkload(workload); + wlEngine->recvWorkload(workload); +} + +void +MPU::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + pushEngine->recvVertexPush(addr, delta, edge_index, degree); +} + +void +MPU::recvDoneSignal() +{ + if (done()) { + centeralController->recvDoneSignal(); + } +} + +bool +MPU::done() +{ + return wlEngine->done() && coalesceEngine->done() && pushEngine->done(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..2008a7dc4f --- /dev/null +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 
+1,107 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" +#include "params/MPU.hh" + +namespace gem5 +{ + +class CenteralController; + +class MPU : public ClockedObject +{ + private: + System* system; + CenteralController* centeralController; + + WLEngine* wlEngine; + CoalesceEngine* coalesceEngine; + PushEngine* pushEngine; + int sliceCounter; + + EventFunctionWrapper nextSliceEvent; + void processNextSliceEvent(); + public: + PARAMS(MPU); + MPU(const Params& params); + void registerCenteralController(CenteralController* centeral_controller); + + void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } + + unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } + void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } + void swapDirectories() { coalesceEngine->swapDirectories(); } + + int getSliceSize(); + int getSliceCounter() { return sliceCounter; } + int increaseSliceCounter() { return sliceCounter++; } + void updateSliceCounter(int value) { sliceCounter = value;} + void resetSliceCounter() { 
sliceCounter = 0; } + bool bufferRemoteUpdate(int slice_number, PacketPtr pkt); + void scheduleNewSlice(); + + bool handleIncomingUpdate(PacketPtr pkt); + void handleIncomingWL(Addr addr, WorkListItem wl); + ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + void recvWLWrite(Addr addr, WorkListItem wl); + void recvWorkload(GraphWorkload* Workload); + + int workCount() { return coalesceEngine->workCount(); } + void recvVertexPull() { return coalesceEngine->recvVertexPull(); } + bool running() { return pushEngine->running(); } + void start() { return pushEngine->start(); } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + + void recvDoneSignal(); + bool done(); +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc new file mode 100644 index 0000000000..981b581b7c --- /dev/null +++ b/src/accl/graph/sega/push_engine.cc @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +#include "accl/graph/sega/mpu.hh" +#include "debug/PushEngine.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const Params& params): + BaseMemoryEngine(params), + _running(false), + lastIdleEntranceTick(0), + numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), + onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + maxPropagatesPerCycle(params.max_propagates_per_cycle), + updateQueueSize(params.update_queue_size), + nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), + nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), + nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), + stats(*this) +{ + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + outPorts.emplace_back( + name() + ".out_ports" + std::to_string(i), this, i); + } +} + +Port& +PushEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "out_ports") { + return outPorts[idx]; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort(if_name, idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +PushEngine::init() +{ + localAddrRange = owner->getAddrRanges(); + for (int i = 0; i < outPorts.size(); i++){ + portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); + } +} + +void +PushEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + DPRINTF(PushEngine, "%s: Packet is blocked.\n", __func__); + blockedPacket = pkt; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + DPRINTF(PushEngine, "%s: blockedPacket sent successfully.\n", __func__); + owner->recvReqRetry(); + } +} + +void +PushEngine::recvReqRetry() +{ + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + +bool +PushEngine::vertexSpace() +{ + return (edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) < edgePointerQueueSize); +} + +bool +PushEngine::workLeft() +{ + return ((owner->workCount() - numPendingPulls) > 0); +} + +bool +PushEngine::done() +{ + bool empty_update_queues = true; + for (int i = 0; i < outPorts.size(); i++) { + empty_update_queues &= updateQueues[outPorts[i].id()].empty(); + } + return empty_update_queues && metaEdgeQueue.empty() && + (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); +} + +void +PushEngine::start() +{ + assert(!_running); + // assert(!nextVertexPullEvent.scheduled()); + + _running = true; + // stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); + // NOTE: We might have to check for size availability here. 
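+    // (start() is expected to be called only after the caller has produced + // work, e.g. when CoalesceEngine::recvWLWrite activates a block, hence + // the assert below instead of a graceful return.)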
+ assert(workLeft()); + if (vertexSpace() && !nextVertexPullEvent.scheduled()) { + schedule(nextVertexPullEvent, nextCycle()); + } +} + +void +PushEngine::processNextVertexPullEvent() +{ + if (workLeft()) { + numPendingPulls++; + owner->recvVertexPull(); + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + } else { + _running = false; + lastIdleEntranceTick = curTick(); + DPRINTF(PushEngine, "%s: In idle state now.\n", __func__); + } +} + +void +PushEngine::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + assert(degree > 0); + assert((edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); + + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); + + edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); + numPendingPulls--; + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +void +PushEngine::processNextMemoryReadEvent() +{ + if (memPort.blocked()) { + nextMemoryReadEvent.sleep(); + return; + } + Addr aligned_addr, offset; + int num_edges; + + EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front()); + Tick entrance_tick = std::get<1>(edgePointerQueue.front()); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + { + DPRINTF(PushEngine, "%s: Current packet information generated by " + "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); + PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges}; + reqInfoMap[pkt->req] = push_info; + memPort.sendPacket(pkt); + onTheFlyMemReqs += num_edges; + + curr_info.iterate(); + if (curr_info.done()) { + DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); + stats.edgePointerQueueLatency.sample( + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + edgePointerQueue.pop_front(); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); + DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. " + "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); + } + } + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if (!edgePointerQueue.empty()) { + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +void +PushEngine::recvMemRetry() +{ + if (nextMemoryReadEvent.pending()) { + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); + nextMemoryReadEvent.wake(); + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +bool +PushEngine::handleMemResp(PacketPtr pkt) +{ + // TODO: In case we ever need to edit edges, drop the isWrite condition. + assert(pkt->isResponse() && (!pkt->isWrite()));
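+    // NOTE: peerMemoryAtomSize is fixed at construction time; the stack + // buffer below relies on the variable-length-array extension provided + // by the GCC/Clang toolchains used to build gem5.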
+ uint8_t pkt_data[peerMemoryAtomSize]; + PushInfo push_info = reqInfoMap[pkt->req]; + pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); + + for (int i = 0; i < push_info.numElements; i++) { + Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); + Addr edge_dst = edge->neighbor; + uint32_t edge_weight = edge->weight; + MetaEdge meta_edge( + push_info.src, edge_dst, edge_weight, push_info.value); + metaEdgeQueue.emplace_back(meta_edge, curTick()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); + } + stats.numWastefulEdgesRead += + (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; + + onTheFlyMemReqs -= push_info.numElements; + reqInfoMap.erase(pkt->req); + delete pkt; + + if (!nextPropagateEvent.scheduled()) { + schedule(nextPropagateEvent, nextCycle()); + } + return true; +} + +void +PushEngine::processNextPropagateEvent() +{ + int num_propagates = 0; + while (true) { + MetaEdge meta_edge; + Tick entrance_tick; + std::tie(meta_edge, entrance_tick) = metaEdgeQueue.front(); + + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, meta_edge.to_string()); + + uint32_t update_value = + graphWorkload->propagate(meta_edge.value, meta_edge.weight); + Update update(meta_edge.src, meta_edge.dst, update_value); + metaEdgeQueue.pop_front(); + + if (enqueueUpdate(update)) { + DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", + __func__, meta_edge.to_string()); + stats.numPropagates++; + stats.edgeQueueLatency.sample( + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); + } else { + metaEdgeQueue.emplace_back(meta_edge, entrance_tick); + } + num_propagates++; + + if (metaEdgeQueue.empty()) { + break; + } + if (num_propagates >= maxPropagatesPerCycle) { + break; + } + } + + stats.numPropagatesHist.sample(num_propagates); + + assert(!nextPropagateEvent.scheduled()); + if (!metaEdgeQueue.empty()) { + schedule(nextPropagateEvent, nextCycle()); + } +} + +bool +PushEngine::enqueueUpdate(Update update) +{ + Addr dst_addr = update.dst; + bool found_coalescing = false; + bool found_locally = false; + bool accepted = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(dst_addr); + } + DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); + for (int i = 0; i < outPorts.size(); i++) { + AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; + if (contains(addr_range_list, dst_addr)) { + DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", + __func__, update.to_string(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: There are %d updates already " + "in queue for port %d.\n", __func__, + updateQueues[outPorts[i].id()].size(), + outPorts[i].id()); + for (auto& entry: updateQueues[outPorts[i].id()]) { + Update& curr_update = std::get<0>(entry); + if (curr_update.dst == update.dst) { + uint32_t old_value = curr_update.value; + curr_update.value = graphWorkload->reduce(old_value, update.value); + DPRINTF(PushEngine, "%s: Found a coalescing opportunity " + "for destination %d with new value: %d by " + "coalescing %d and %d.
\n", __func__, update.dst, + curr_update.value, old_value, update.value); + found_coalescing = true; + accepted = true; + stats.updateQueueCoalescions++; + } + } + if ((found_coalescing == false) && + (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { + DPRINTF(PushEngine, "%s: There is a free entry available " + "in queue %d.\n", __func__, outPorts[i].id()); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + DPRINTF(PushEngine, "%s: Emplaced the update at the back " + "of queue for port %d is. Size of queue " + "for port %d is %d.\n", __func__, + outPorts[i].id(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); + accepted = true; + stats.updateQueueLength.sample( + updateQueues[outPorts[i].id()].size()); + } + } + } + + if (accepted && (!nextUpdatePushEvent.scheduled())) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + return accepted; +} + +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +PushEngine::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < outPorts.size(); i++) { + if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d blocked.\n", + __func__, outPorts[i].id()); + continue; + } + DPRINTF(PushEngine, "%s: Port %d available.\n", + __func__, outPorts[i].id()); + if (updateQueues[outPorts[i].id()].empty()) { + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d is empty.\n", __func__, outPorts[i].id()); + continue; + } + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d not empty.\n", __func__, outPorts[i].id()); + Update update; + Tick entrance_tick; + std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); + PacketPtr pkt = createUpdatePacket(update.dst, update.value); + outPorts[i].sendPacket(pkt); + DPRINTF(PushEngine, "%s: Sent update: %s to port %d. 
" + "Respective queue size is %d.\n", __func__, + update.to_string(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); + updateQueues[outPorts[i].id()].pop_front(); + if (updateQueues[outPorts[i].id()].size() > 0) { + next_time_send += 1; + } + stats.numUpdates++; + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + +PushEngine::PushStats::PushStats(PushEngine &_push) + : statistics::Group(&_push), + push(_push), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Number of propagate operations done."), + ADD_STAT(numNetBlocks, statistics::units::Count::get(), + "Number of updates blocked by network."), + // ADD_STAT(numIdleCycles, statistics::units::Count::get(), + // "Number of cycles PushEngine has been idle."), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of updates sent to the network."), + ADD_STAT(numWastefulEdgesRead, statistics::units::Count::get(), + "Number of wasteful edges read from edge memory."), + ADD_STAT(TEPS, statistics::units::Rate::get(), + "Traversed Edges Per Second."), + ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgePointerQueueLength, statistics::units::Count::get(), + "Histogram of the size of the edgePointerQueue."), + ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the metaEdgeQueue."), + ADD_STAT(edgeQueueLength, statistics::units::Count::get(), + "Histogram of the size of the metaEdgeQueue."), + ADD_STAT(updateQueueLength, statistics::units::Count::get(), + "Histogram of the length of updateQueues."), + ADD_STAT(numPropagatesHist, statistics::units::Count::get(), + "Histogram of number of propagates sent.") +{ +} + +void +PushEngine::PushStats::regStats() +{ + using namespace statistics; + + TEPS = numPropagates / simSeconds; + + edgePointerQueueLatency.init(64); + edgePointerQueueLength.init(64); + edgeQueueLatency.init(64); + edgeQueueLength.init(64); + updateQueueLength.init(64); + numPropagatesHist.init(push.params().max_propagates_per_cycle); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh new file mode 100644 index 0000000000..f51865acb3 --- /dev/null +++ b/src/accl/graph/sega/push_engine.hh @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/base_memory_engine.hh"
+#include "accl/graph/sega/enums.hh"
+#include "base/intmath.hh"
+#include "params/PushEngine.hh"
+
+namespace gem5
+{
+
+class MPU;
+
+class PushEngine : public BaseMemoryEngine
+{
+  private:
+    class ReqPort : public RequestPort
+    {
+      private:
+        PushEngine* owner;
+        PacketPtr blockedPacket;
+        PortID _id;
+
+      public:
+        ReqPort(const std::string& name, PushEngine* owner, PortID id) :
+            RequestPort(name, owner),
+            owner(owner), blockedPacket(nullptr), _id(id)
+        {}
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return (blockedPacket != nullptr); }
+        PortID id() { return _id; }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    class EdgeReadInfoGen {
+      private:
+        Addr _src;
+        uint32_t _delta;
+
+        Addr _start;
+        Addr _end;
+        size_t _step;
+        size_t _atom;
+
+      public:
+        EdgeReadInfoGen(Addr src, uint32_t delta, Addr start,
+                        Addr end, size_t step, size_t atom):
+            _src(src), _delta(delta), _start(start),
+            _end(end), _step(step), _atom(atom)
+        {}
+
+        Addr src() { return _src; }
+        uint32_t delta() { return _delta; }
+
+        std::tuple<Addr, Addr, int> nextReadPacketInfo()
+        {
+            panic_if(done(),
+                     "Should not call nextReadPacketInfo when done.\n");
+            Addr aligned_addr = roundDown(_start, _atom);
+            Addr offset = _start - aligned_addr;
+            int num_items = 0;
+
+            if (_end > (aligned_addr + _atom)) {
+                num_items = (_atom - offset) / _step;
+            } else {
+                num_items = (_end - _start) / _step;
+            }
+
+            return std::make_tuple(aligned_addr, offset, num_items);
+        }
+
+        void iterate()
+        {
+            panic_if(done(), "Should not call iterate when done.\n");
+            Addr aligned_addr = roundDown(_start, _atom);
+            _start = aligned_addr + _atom;
+        }
+
+        bool done() { return (_start >= _end); }
+    };
+    struct PushInfo {
+        Addr src;
+        uint32_t value;
+        Addr offset;
+        int numElements;
+    };
+    MPU* owner;
+    GraphWorkload* graphWorkload;
+
+    bool _running;
+    Tick lastIdleEntranceTick;
+
+    AddrRangeList localAddrRange;
+
+    int numPendingPulls;
+    int edgePointerQueueSize;
+    std::deque<std::tuple<EdgeReadInfoGen, Tick>> edgePointerQueue;
+    std::unordered_map<RequestPtr, PushInfo> reqInfoMap;
+
+    int onTheFlyMemReqs;
+    int edgeQueueSize;
+    int maxPropagatesPerCycle;
+    std::deque<std::tuple<MetaEdge, Tick>> metaEdgeQueue;
+
+    int updateQueueSize;
+    template<typename T> PacketPtr createUpdatePacket(Addr addr, T value);
+    bool enqueueUpdate(Update update);
+    std::unordered_map<PortID, AddrRangeList> portAddrMap;
+    std::unordered_map<PortID, std::deque<std::tuple<Update, Tick>>>
+        updateQueues;
+    std::vector<ReqPort> outPorts;
+
+    bool vertexSpace();
+    bool workLeft();
+
+    EventFunctionWrapper nextVertexPullEvent;
+    void processNextVertexPullEvent();
+
+    MemoryEvent
nextMemoryReadEvent; + void processNextMemoryReadEvent(); + + EventFunctionWrapper nextPropagateEvent; + void processNextPropagateEvent(); + + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + + struct PushStats : public statistics::Group + { + PushStats(PushEngine &push); + + void regStats() override; + + PushEngine &push; + + statistics::Scalar numMemoryBlocks; + statistics::Scalar numPropagates; + statistics::Scalar numNetBlocks; + // statistics::Scalar numIdleCycles; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar numUpdates; + statistics::Scalar numWastefulEdgesRead; + + statistics::Formula TEPS; + + statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgePointerQueueLength; + statistics::Histogram edgeQueueLatency; + statistics::Histogram edgeQueueLength; + statistics::Histogram updateQueueLength; + statistics::Histogram numPropagatesHist; + }; + + PushStats stats; + + protected: + virtual void recvMemRetry(); + virtual bool handleMemResp(PacketPtr pkt); + + public: + PARAMS(PushEngine); + PushEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerMPU(MPU* mpu); + + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + + void start(); + bool running() { return _running; } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + + void recvReqRetry(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/state_machine.md b/src/accl/graph/sega/state_machine.md new file mode 100644 index 0000000000..203c47cf02 --- /dev/null +++ b/src/accl/graph/sega/state_machine.md @@ -0,0 +1 @@ +# CoalesceEngine Block state machine \ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc new file mode 100644 index 0000000000..b4649b6a9d --- /dev/null +++ b/src/accl/graph/sega/wl_engine.cc @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/wl_engine.hh" + +#include "accl/graph/sega/mpu.hh" +#include "debug/SEGAStructureSize.hh" +#include "debug/WLEngine.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams& params): + BaseReduceEngine(params), + updateQueueSize(params.update_queue_size), + registerFileSize(params.register_file_size), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()), + stats(*this) +{ + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } +} + +Port& +WLEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_ports") { + return inPorts[idx]; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +WLEngine::init() +{ + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } +} + +void +WLEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +AddrRangeList +WLEngine::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void +WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + needSendRetryReq = false; + sendRetryReq(); + } +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::checkRetryReq() +{ + for (int i = 0; i < inPorts.size(); i++) { + inPorts[i].checkRetryReq(); + } +} + +bool +WLEngine::done() +{ + return registerFile.empty() && updateQueue.empty(); +} + +bool +WLEngine::handleIncomingUpdate(PacketPtr pkt) +{ + int slice_number = (int)(pkt->getAddr()/(owner->getSliceSize())); + if (slice_number != owner->getSliceCounter()) { + DPRINTF(WLEngine, "%s: Packet %lu slice number is: %d. 
The current " + "slice number is: %d, The total number of vertices/slice: %d \n", + __func__, pkt->getAddr(), slice_number, + owner->getSliceCounter(), + owner->getSliceSize()/sizeof(WorkListItem)); + bool ret = owner->bufferRemoteUpdate(slice_number, pkt); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return ret; + } + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } + + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE(), curTick()); + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + + // delete the packet since it's not needed anymore. + delete pkt; + + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } + return true; +} + + +// TODO: Parameterize the number of pops WLEngine can do at a time. +// TODO: Add a histogram stats of the size of the updateQueue. Sample here. +void +WLEngine::processNextReadEvent() +{ + Addr update_addr; + uint32_t update_value; + Tick enter_tick; + std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); + + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " + "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); + + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = update_value; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + updateQueue.pop_front(); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. 
" + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + checkRetryReq(); + vertexReadTime[update_addr] = curTick(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + updateQueue.pop_front(); + updateQueue.emplace_back( + update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + stats.numUpdateRolls++; + } else { + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Not rolling the update.\n", __func__); + } + } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + stats.registerShortage++; + } + } else { + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", + __func__, update_addr, update_addr, registerFile[update_addr]); + registerFile[update_addr] = + graphWorkload->reduce(update_value, registerFile[update_addr]); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, registerFile[update_addr]); + stats.registerFileCoalesce++; + updateQueue.pop_front(); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + checkRetryReq(); + } + + if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { + schedule(nextReadEvent, nextCycle()); + } +} + +void +WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) +{ + assert(workListFile.size() <= registerFileSize); + + workListFile[addr] = wl; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + graphWorkload->printWorkListItem(wl), workListFile.size()); + DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + graphWorkload->printWorkListItem(wl), workListFile.size()); + + stats.vertexReadLatency.sample( + ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); + vertexReadTime.erase(addr); + + assert(!workListFile.empty()); + if (!nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } +} + +void +WLEngine::processNextReduceEvent() +{ + for (auto &it : workListFile) { + Addr addr = it.first; + assert(registerFile.find(addr) != registerFile.end()); + uint32_t update_value = registerFile[addr]; + DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" + ". registerFile[%lu] = %u, workListFile[%lu] = %s.\n", + __func__, addr, registerFile[addr], addr, + graphWorkload->printWorkListItem(workListFile[addr])); + // TODO: Generalize this to reduce function rather than just min + workListFile[addr].tempProp = + graphWorkload->reduce(update_value, workListFile[addr].tempProp); + DPRINTF(WLEngine, "%s: Reduction done. 
workListFile[%lu] = %s.\n",
+                __func__, addr,
+                graphWorkload->printWorkListItem(workListFile[addr]));
+        stats.numReduce++;
+
+        owner->recvWLWrite(addr, workListFile[addr]);
+        registerFile.erase(addr);
+        DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. "
+                "registerFile.size = %d, registerFileSize = %d.\n",
+                __func__, addr, registerFile.size(), registerFileSize);
+        DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. "
+                "registerFile.size = %d, registerFileSize = %d.\n",
+                __func__, addr, registerFile.size(), registerFileSize);
+    }
+    workListFile.clear();
+
+    if (done() && !nextDoneSignalEvent.scheduled()) {
+        schedule(nextDoneSignalEvent, nextCycle());
+    }
+}
+
+void
+WLEngine::processNextDoneSignalEvent()
+{
+    if (done()) {
+        owner->recvDoneSignal();
+    }
+}
+
+WLEngine::WorkListStats::WorkListStats(WLEngine &_wl)
+    : statistics::Group(&_wl),
+    wl(_wl),
+    ADD_STAT(numReduce, statistics::units::Count::get(),
+             "Number of reduce operations performed."),
+    ADD_STAT(registerFileCoalesce, statistics::units::Count::get(),
+             "Number of updates coalesced in the register file."),
+    ADD_STAT(registerShortage, statistics::units::Count::get(),
+             "Number of times updates were "
+             "stalled because of register shortage."),
+    ADD_STAT(numUpdateRolls, statistics::units::Count::get(),
+             "Number of times an update has been rolled back "
+             "to the back of the update queue due to cache reject."),
+    ADD_STAT(vertexReadLatency, statistics::units::Second::get(),
+             "Histogram of the latency of reading a vertex (ns)."),
+    ADD_STAT(updateQueueLatency, statistics::units::Second::get(),
+             "Histogram of the latency of dequeuing an update (ns).")
+{
+}
+
+void
+WLEngine::WorkListStats::regStats()
+{
+    using namespace statistics;
+
+    vertexReadLatency.init(64);
+    updateQueueLatency.init(64);
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh
new file mode 100644
index 0000000000..fb147e692a
--- /dev/null
+++ b/src/accl/graph/sega/wl_engine.hh
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
+
+#include <deque>
+#include <unordered_map>
+
+#include "accl/graph/base/base_reduce_engine.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/sega/enums.hh"
+#include "base/statistics.hh"
+#include "params/WLEngine.hh"
+
+namespace gem5
+{
+
+class MPU;
+
+class WLEngine : public BaseReduceEngine
+{
+  private:
+    class RespPort : public ResponsePort
+    {
+      private:
+        WLEngine* owner;
+        bool needSendRetryReq;
+        PortID _id;
+
+      public:
+        RespPort(const std::string& name, WLEngine* owner, PortID id):
+            ResponsePort(name, owner),
+            owner(owner), needSendRetryReq(false), _id(id)
+        {}
+        virtual AddrRangeList getAddrRanges() const;
+
+        PortID id() { return _id; }
+        void checkRetryReq();
+
+      protected:
+        virtual bool recvTimingReq(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt);
+        virtual void recvFunctional(PacketPtr pkt);
+        virtual void recvRespRetry();
+    };
+
+    MPU* owner;
+    GraphWorkload* graphWorkload;
+
+    std::vector<RespPort> inPorts;
+
+    int updateQueueSize;
+    std::deque<std::tuple<Addr, uint32_t, Tick>> updateQueue;
+
+    int registerFileSize;
+    std::unordered_map<Addr, uint32_t> registerFile;
+    std::unordered_map<Addr, Tick> vertexReadTime;
+    std::unordered_map<Addr, WorkListItem> workListFile;
+
+    EventFunctionWrapper nextReadEvent;
+    void processNextReadEvent();
+
+    EventFunctionWrapper nextReduceEvent;
+    void processNextReduceEvent();
+
+    EventFunctionWrapper nextDoneSignalEvent;
+    void processNextDoneSignalEvent();
+
+    struct WorkListStats : public statistics::Group
+    {
+        WorkListStats(WLEngine &worklist);
+
+        void regStats() override;
+
+        WLEngine &wl;
+
+        statistics::Scalar numReduce;
+        statistics::Scalar registerFileCoalesce;
+        statistics::Scalar registerShortage;
+        statistics::Scalar numUpdateRolls;
+
+        statistics::Histogram vertexReadLatency;
+        statistics::Histogram updateQueueLatency;
+    };
+
+    WorkListStats stats;
+
+  public:
+    PARAMS(WLEngine);
+    WLEngine(const Params& params);
+    Port& getPort(const std::string& if_name,
+                  PortID idx = InvalidPortID) override;
+    virtual void init() override;
+    void registerMPU(MPU* mpu);
+
+    AddrRangeList getAddrRanges();
+    void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; }
+    void recvFunctional(PacketPtr pkt);
+
+    bool handleIncomingUpdate(PacketPtr pkt);
+    void handleIncomingWL(Addr addr, WorkListItem wl);
+
+    void checkRetryReq();
+
+    bool done();
+};
+
+} // namespace gem5
+#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh
new file mode 100644
index 0000000000..620e97f654
--- /dev/null
+++ b/src/accl/graph/sega/work_directory.hh
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
+#define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
+
+#include <cassert>
+
+#include "accl/graph/base/data_structs.hh"
+#include "base/addr_range.hh"
+#include "base/types.hh"
+
+namespace gem5
+{
+
+class WorkDirectory
+{
+  public:
+    virtual int activate(Addr atom_addr) = 0;
+    virtual int deactivate(Addr atom_addr) = 0;
+    virtual Addr getNextWork() = 0;
+
+    virtual int workCount() = 0;
+    bool empty() { return workCount() == 0; }
+
+    virtual void setLastAtomAddr(Addr atom_addr) = 0;
+};
+
+class PopCountDirectory: public WorkDirectory
+{
+  private:
+    AddrRange memoryRange;
+
+    int numAtomsPerBlock;
+    int memoryAtomSize;
+    int blockSize;
+
+    uint32_t _workCount;
+
+    int numCounters;
+    int lastCounterIndex;
+    uint32_t* popCount;
+
+    int prevIndex;
+    uint32_t currentCounter;
+
+    UniqueFIFO<int> activeBlockIndices;
+
+    int getIndexFromAtomAddr(Addr atom_addr)
+    {
+        assert((atom_addr % memoryAtomSize) == 0);
+        Addr trimmed_addr = memoryRange.removeIntlvBits(atom_addr);
+        int index = (int) (trimmed_addr / blockSize);
+        return index;
+    }
+
+    Addr getAtomAddrFromIndex(int block_index, int atom_index)
+    {
+        Addr block_addr = block_index * blockSize;
+        Addr trimmed_addr = block_addr + atom_index * memoryAtomSize;
+        return memoryRange.addIntlvBits(trimmed_addr);
+    }
+
+  public:
+    PopCountDirectory(AddrRange mem_range, int atoms_per_block, int atom_size):
+        WorkDirectory(),
+        memoryRange(mem_range), numAtomsPerBlock(atoms_per_block),
+        memoryAtomSize(atom_size), _workCount(0),
+        prevIndex(-1), currentCounter(0)
+    {
+        blockSize = numAtomsPerBlock * memoryAtomSize;
+        numCounters = (int) (memoryRange.size() / blockSize);
+        lastCounterIndex = numCounters - 1;
+        popCount = new uint32_t [numCounters];
+        for (int index = 0; index < numCounters; index++) {
+            popCount[index] = 0;
+        }
+        activeBlockIndices = UniqueFIFO<int>(numCounters);
+    }
+
+    // CAUTION: This should only be called when the work
+    // directory **is not** tracking the atom with atom_addr
+    virtual int activate(Addr atom_addr)
+    {
+        int index = getIndexFromAtomAddr(atom_addr);
+        uint32_t prev_count = popCount[index];
+        popCount[index]++;
+        _workCount++;
+        activeBlockIndices.push_back(index);
+        assert(popCount[index] > prev_count);
+        assert(popCount[index] <= numAtomsPerBlock);
+        return popCount[index];
+    }
+
+    // CAUTION: This should only be called when the work
+    // directory **is** tracking the atom with atom_addr
+    virtual int deactivate(Addr atom_addr)
+    {
+        int index = getIndexFromAtomAddr(atom_addr);
+        uint32_t prev_count = popCount[index];
+        popCount[index]--;
+        _workCount--;
+        if (popCount[index] == 0) {
+            activeBlockIndices.erase(index);
+        }
+        assert(popCount[index] < prev_count);
+        assert(popCount[index] <= numAtomsPerBlock);
+        return popCount[index];
+    }
+
+    virtual int workCount() { return _workCount; }
+
+    void setLastAtomAddr(Addr atom_addr)
+    {
+        lastCounterIndex = getIndexFromAtomAddr(atom_addr);
+    }
+
+    // CAUTION: This directory only tracks active vertices in the memory
+    // and it does not have any information on the state of the cache and/or
+    // the active buffer or the write buffer. Therefore, it might generate a
+    // read request to an address that might be in any of those. In that case,
+    // the generated address should be ignored.
+    virtual Addr getNextWork()
+    {
+        // The directory should never be asked for work when it is empty.
+        assert(!activeBlockIndices.empty());
+        int front_index = activeBlockIndices.front();
+        assert(popCount[front_index] > 0);
+        if ((prevIndex != -1) && (prevIndex != front_index)) {
+            currentCounter = 0;
+        }
+        if (currentCounter == numAtomsPerBlock) {
+            currentCounter = 0;
+            activeBlockIndices.pop_front();
+            activeBlockIndices.push_back(front_index);
+        }
+        int current_index = activeBlockIndices.front();
+        Addr ret_addr = getAtomAddrFromIndex(current_index, currentCounter);
+        prevIndex = current_index;
+        currentCounter++;
+        return ret_addr;
+    }
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh
index 07bd255d26..3c5c150b29 100644
--- a/src/base/addr_range.hh
+++ b/src/base/addr_range.hh
@@ -48,6 +48,7 @@
 
 #include "base/bitfield.hh"
 #include "base/cprintf.hh"
+#include "base/intmath.hh"
 #include "base/logging.hh"
 #include "base/types.hh"
 
@@ -732,6 +733,40 @@ class AddrRange
     {
         return !(*this == r);
     }
+
+    friend AddrRange
+    mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit)
+    {
+        assert(left.interleaved());
+        assert(right.interleaved());
+        assert(left.mergesWith(right));
+
+        uint8_t old_left_match = left.intlvMatch;
+        uint8_t new_left_match = 0;
+        uint8_t old_right_match = right.intlvMatch;
+        uint8_t new_right_match = 0;
+        int new_bits = left.masks.size() - 1;
+
+        // Assumption: masks are sorted in ascending order.
+        std::vector<Addr> new_masks;
+        for (auto mask: left.masks) {
+            uint64_t lsb_mask = (mask ^ (mask - 1)) + 1;
+            if ((lsb_mask >> 1) != (1ULL << pch_bit)) {
+                new_masks.push_back(mask);
+                new_left_match |= ((old_left_match & 1) << new_bits);
+                new_left_match >>= 1;
+                new_right_match |= ((old_right_match & 1) << new_bits);
+                new_right_match >>= 1;
+            }
+            old_left_match >>= 1;
+            old_right_match >>= 1;
+        }
+        panic_if(new_left_match != new_right_match,
+                 "The two ranges cannot be a pseudo channel pair "
+                 "given the pseudochannel bit position of params.pch_bit.");
+
+        return AddrRange(left._start, left._end, new_masks, new_left_match);
+    }
 };
 
 static inline AddrRangeList
@@ -817,6 +852,16 @@ RangeSize(Addr start, Addr size)
 {
     return AddrRange(start, start + size);
 }
 
+inline bool
+contains(AddrRangeList range_list, Addr addr) +{ + bool ret = false; + for (auto range: range_list) { + ret |= range.contains(addr); + } + return ret; +} + } // namespace gem5 #endif // __BASE_ADDR_RANGE_HH__ diff --git a/src/base/statistics.hh b/src/base/statistics.hh index 24cbf714f5..15aeff892e 100644 --- a/src/base/statistics.hh +++ b/src/base/statistics.hh @@ -1052,7 +1052,7 @@ class VectorBase : public DataWrapVec Proxy operator[](off_type index) { - assert (index < size()); + // assert (index < size()); return Proxy(this->self(), index); } }; diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py index 0c7c1ea919..f7355d4b67 100644 --- a/src/mem/HBMCtrl.py +++ b/src/mem/HBMCtrl.py @@ -42,6 +42,8 @@ class HBMCtrl(MemCtrl): # HBMCtrl has been tested with two HBM_2000_4H_1x64 interfaces dram_2 = Param.DRAMInterface("DRAM memory interface") + pch_bit = Param.Int("Position of PseudoChannel bit in addresses.") + # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaaces # gives the best results with following min_r/w_per_switch min_reads_per_switch = 64 diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index 99618c4b5f..efd46bbd54 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,6 +45,7 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), + pchBit(p.pch_bit), retryRdReqPC1(false), retryWrReqPC1(false), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, @@ -233,7 +234,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) bool is_pc0; // TODO: make the interleaving bit across pseudo channels a parameter - if (bits(pkt->getAddr(), 6) == 0) { + if (bits(pkt->getAddr(), pchBit) == 0) { is_pc0 = true; } else { is_pc0 = false; @@ -492,8 +493,11 @@ AddrRangeList HBMCtrl::getAddrRanges() { AddrRangeList ranges; - ranges.push_back(pc0Int->getAddrRange()); - ranges.push_back(pc1Int->getAddrRange()); + AddrRange pc0Int_range = pc0Int->getAddrRange(); + AddrRange pc1Int_range = pc1Int->getAddrRange(); + ranges.push_back( + mergePseudoChannelRanges(pc0Int_range, pc1Int_range, pchBit) + ); return ranges; } diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index c9045f0ae7..f204b8346f 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -72,7 +72,8 @@ class HBMCtrl : public MemCtrl } private: - + // Position of the pseudochannel bit in addresses. + int pchBit; /** * Remember if we have to retry a request for second pseudo channel. 
*/ diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index c65d68a5a7..3cbacef800 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -212,7 +212,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 31dc330cab..daf9d18e88 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -237,6 +237,7 @@ MemCmd::commandInfo[] = { {IsRead, IsResponse}, InvalidCmd, "HTMReqResp" }, { {IsRead, IsRequest}, InvalidCmd, "HTMAbort" }, { {IsRequest}, InvalidCmd, "TlbiExtSync" }, + { {IsRequest, HasData}, InvalidCmd, "UpdateWL"} }; AddrRange diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbec00..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -148,6 +148,8 @@ class MemCmd HTMAbort, // Tlb shootdown TlbiExtSync, + // MPU Accelerator + UpdateWL, NUM_MEM_CMDS }; diff --git a/src/mem/port_proxy.cc b/src/mem/port_proxy.cc index 19e1a53e84..55145ab7d7 100644 --- a/src/mem/port_proxy.cc +++ b/src/mem/port_proxy.cc @@ -56,7 +56,7 @@ PortProxy::PortProxy(const RequestPort &port, unsigned int cache_line_size) : void PortProxy::readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const + void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -73,7 +73,7 @@ PortProxy::readBlobPhys(Addr addr, Request::Flags flags, void PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const + const void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -90,7 +90,7 @@ PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, void PortProxy::memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const + uint8_t v, Addr size) const { // quick and dirty... uint8_t *buf = new uint8_t[size]; diff --git a/src/mem/port_proxy.hh b/src/mem/port_proxy.hh index 29f6ba60a4..8cd21322ea 100644 --- a/src/mem/port_proxy.hh +++ b/src/mem/port_proxy.hh @@ -120,19 +120,19 @@ class PortProxy : FunctionalRequestProtocol * Read size bytes memory at physical address and store in p. */ void readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const; + void *p, Addr size) const; /** * Write size bytes from p to physical address. */ void writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const; + const void *p, Addr size) const; /** * Fill size bytes starting at physical addr with byte value val. */ void memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const; + uint8_t v, Addr size) const; @@ -143,7 +143,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryReadBlob(Addr addr, void *p, int size) const + tryReadBlob(Addr addr, void *p, Addr size) const { readBlobPhys(addr, 0, p, size); return true; @@ -154,7 +154,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. 
*/ virtual bool - tryWriteBlob(Addr addr, const void *p, int size) const + tryWriteBlob(Addr addr, const void *p, Addr size) const { writeBlobPhys(addr, 0, p, size); return true; @@ -165,7 +165,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryMemsetBlob(Addr addr, uint8_t val, int size) const + tryMemsetBlob(Addr addr, uint8_t val, Addr size) const { memsetBlobPhys(addr, 0, val, size); return true; @@ -179,7 +179,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryReadBlob, but insists on success. */ void - readBlob(Addr addr, void *p, int size) const + readBlob(Addr addr, void *p, Addr size) const { if (!tryReadBlob(addr, p, size)) fatal("readBlob(%#x, ...) failed", addr); @@ -189,7 +189,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryWriteBlob, but insists on success. */ void - writeBlob(Addr addr, const void *p, int size) const + writeBlob(Addr addr, const void *p, Addr size) const { if (!tryWriteBlob(addr, p, size)) fatal("writeBlob(%#x, ...) failed", addr); @@ -199,7 +199,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryMemsetBlob, but insists on success. */ void - memsetBlob(Addr addr, uint8_t v, int size) const + memsetBlob(Addr addr, uint8_t v, Addr size) const { if (!tryMemsetBlob(addr, v, size)) fatal("memsetBlob(%#x, ...) failed", addr); diff --git a/src/mem/translating_port_proxy.cc b/src/mem/translating_port_proxy.cc index 8ab859f40d..bc698c1a07 100644 --- a/src/mem/translating_port_proxy.cc +++ b/src/mem/translating_port_proxy.cc @@ -86,7 +86,7 @@ TranslatingPortProxy::tryOnBlob(BaseMMU::Mode mode, TranslationGenPtr gen, } bool -TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const +TranslatingPortProxy::tryReadBlob(Addr addr, void *p, Addr size) const { constexpr auto mode = BaseMMU::Read; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -99,7 +99,7 @@ TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const bool TranslatingPortProxy::tryWriteBlob( - Addr addr, const void *p, int size) const + Addr addr, const void *p, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -111,7 +111,7 @@ TranslatingPortProxy::tryWriteBlob( } bool -TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, int size) const +TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( diff --git a/src/mem/translating_port_proxy.hh b/src/mem/translating_port_proxy.hh index bedb57a3ce..7e619784b1 100644 --- a/src/mem/translating_port_proxy.hh +++ b/src/mem/translating_port_proxy.hh @@ -77,16 +77,16 @@ class TranslatingPortProxy : public PortProxy /** Version of tryReadblob that translates virt->phys and deals * with page boundries. */ - bool tryReadBlob(Addr addr, void *p, int size) const override; + bool tryReadBlob(Addr addr, void *p, Addr size) const override; /** Version of tryWriteBlob that translates virt->phys and deals * with page boundries. */ - bool tryWriteBlob(Addr addr, const void *p, int size) const override; + bool tryWriteBlob(Addr addr, const void *p, Addr size) const override; /** * Fill size bytes starting at addr with byte value val. */ - bool tryMemsetBlob(Addr address, uint8_t v, int size) const override; + bool tryMemsetBlob(Addr address, uint8_t v, Addr size) const override; }; } // namespace gem5
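
Note on PopCountDirectory (src/accl/graph/sega/work_directory.hh): the round-robin scan in getNextWork() is easier to see in isolation. The sketch below is a minimal, self-contained model of that policy, not part of the patch, and it makes simplifying assumptions: a flat address space with no interleaving (the real class strips and re-adds interleave bits through AddrRange), a plain std::deque standing in for gem5's UniqueFIFO<int>, and a hypothetical name (SimplePopCountDirectory). The design point it illustrates: one pop counter per block of atoms gives O(1) "is there work / where is it" lookups without a per-vertex bitmap, at the cost of possibly generating addresses whose atom is actually held in a cache or buffer (hence the CAUTION comment in the header).

// Build with: g++ -std=c++17 popcount_sketch.cc
#include <cassert>
#include <cstdint>
#include <deque>
#include <iostream>
#include <vector>

class SimplePopCountDirectory
{
    int numAtomsPerBlock;
    int atomSize;
    int blockSize;
    std::vector<uint32_t> popCount; // active atoms per block
    std::deque<int> activeBlocks;   // FIFO of blocks with pending work
    int prevIndex = -1;
    uint32_t currentCounter = 0;

  public:
    SimplePopCountDirectory(uint64_t mem_size, int atoms_per_block,
                            int atom_size)
      : numAtomsPerBlock(atoms_per_block), atomSize(atom_size),
        blockSize(atoms_per_block * atom_size),
        popCount(mem_size / blockSize, 0)
    {}

    void activate(uint64_t atom_addr)
    {
        int index = atom_addr / blockSize;
        // Enqueue a block only on its 0 -> 1 transition; this mimics the
        // uniqueness guarantee UniqueFIFO provides in the real class.
        if (popCount[index]++ == 0)
            activeBlocks.push_back(index);
    }

    // Round-robin scan: hand out consecutive atoms of the front block,
    // then rotate the block to the back, as in getNextWork() above.
    uint64_t getNextWork()
    {
        assert(!activeBlocks.empty());
        int front_index = activeBlocks.front();
        if (prevIndex != -1 && prevIndex != front_index)
            currentCounter = 0;
        if (currentCounter == (uint32_t)numAtomsPerBlock) {
            currentCounter = 0;
            activeBlocks.pop_front();
            activeBlocks.push_back(front_index);
        }
        int current = activeBlocks.front();
        uint64_t addr = (uint64_t)current * blockSize
                      + (uint64_t)currentCounter * atomSize;
        prevIndex = current;
        currentCounter++;
        return addr;
    }
};

int main()
{
    // 1 KiB of "memory": 4 atoms of 64 B per 256 B block.
    SimplePopCountDirectory dir(1024, 4, 64);
    dir.activate(0);   // block 0
    dir.activate(512); // block 2
    for (int i = 0; i < 6; i++)
        std::cout << dir.getNextWork() << "\n"; // 0 64 128 192 512 576
    return 0;
}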
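
Similarly, mergePseudoChannelRanges() in src/base/addr_range.hh can be read as "delete the interleaving level that selects the pseudo channel". The sketch below, again illustrative only and using a hypothetical helper name (dropPchMask), models just the mask filtering on plain integers; the real function additionally re-packs the intlvMatch bits of both ranges and panics if they disagree. It assumes, like the patch, that a mask belonging to the pseudo-channel level is identified by its lowest set bit being the pch_bit.

// Build with: g++ -std=c++17 pch_merge_sketch.cc
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Return the interleave masks with the pseudo-channel mask removed.
std::vector<uint64_t>
dropPchMask(const std::vector<uint64_t> &masks, int pch_bit)
{
    std::vector<uint64_t> merged;
    for (uint64_t mask : masks) {
        uint64_t lsb = mask & ~(mask - 1);     // lowest set bit of the mask
        if (lsb != (1ULL << pch_bit))
            merged.push_back(mask);            // keep non-pseudo-channel masks
    }
    assert(merged.size() == masks.size() - 1); // exactly one mask dropped
    return merged;
}

int main()
{
    // Two interleave masks: bit 6 selects the pseudo channel (the HBMCtrl
    // default) and bit 8 is some coarser channel interleaving. Merging the
    // two pseudo-channel ranges removes only the bit-6 mask.
    std::vector<uint64_t> masks = {1ULL << 6, 1ULL << 8};
    for (uint64_t m : dropPchMask(masks, 6))
        std::cout << std::hex << m << "\n";    // prints: 100
    return 0;
}

This is also why HBMCtrl::getAddrRanges() in the patch advertises one merged range instead of two: to the rest of the system the controller now looks like a single contiguous target, and the pch_bit parameter alone decides which pseudo channel a request lands in.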