diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py new file mode 100644 index 0000000000..0bfb6caeaa --- /dev/null +++ b/configs/accl/async-pr.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + alpha, + threshold, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_async_pr_workload(alpha, threshold) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() 
limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/bc.py b/configs/accl/bc.py new file mode 100644 index 0000000000..56faeb3e4d --- /dev/null +++ b/configs/accl/bc.py @@ -0,0 +1,131 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_bc_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + iterations = 0 + while True: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iterations += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iterations}") + if verify: + system.print_answer() diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py new file mode 100644 index 0000000000..97f1b5dc21 --- /dev/null +++ b/configs/accl/bfs.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.visited, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + visited, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/cc.py b/configs/accl/cc.py new file mode 100644 index 0000000000..9b6d2b587d --- /dev/null +++ b/configs/accl/cc.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
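Every run script in this patch repeats the same --sample loop; the 100000000-tick slice corresponds to the 100us in the help text, given gem5's default 1 ps tick. A sketch of a shared helper that could hypothetically factor this out (it is not part of the patch):

```python
import m5

def simulate_with_sampling(interval=100_000_000):
    """Run the simulation in fixed-tick slices, dumping stats per slice."""
    while True:
        exit_event = m5.simulate(interval)
        print(
            f"Exited simulation at tick {m5.curTick()} "
            f"because {exit_event.getCause()}"
        )
        m5.stats.dump()
        m5.stats.reset()
        # Any cause other than the tick limit means the workload finished
        # (or another exit was scheduled), so stop sampling.
        if exit_event.getCause() != "simulate() limit reached":
            return exit_event
```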
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_cc_workload() + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py new file mode 100644 index 0000000000..569514eb82 --- /dev/null +++ b/configs/accl/pr.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights 
reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("iterations", type=int) + argparser.add_argument("alpha", type=float) + argparser.add_argument("--num_nodes", type=int, default=1) + argparser.add_argument("--error_threshold", type=float, default=0.0) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.iterations, + args.alpha, + args.num_nodes, + args.error_threshold, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + iterations, + alpha, + num_nodes, + error_threshold, + simple, + sample, + verify, + ) = get_inputs() + + print(f"error_threshold: {error_threshold}") + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_pr_workload(num_nodes, alpha) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit 
reached": + break + else: + iteration = 0 + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + print(f"error: {system.get_pr_error()}") + if system.get_pr_error() < error_threshold: + break + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") + if verify: + system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py new file mode 100644 index 0000000000..32d0dd26ab --- /dev/null +++ b/configs/accl/sega.py @@ -0,0 +1,217 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + dram_2=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + +class SEGA(System): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): + super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + # Building the CenteralController + self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts/2)): + mem = EdgeMemory("4GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the 
GPTs + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) + gpt.set_vertex_pch_bit(pch_bit) + gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.getPRError() + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py new file mode 100644 index 0000000000..2d36ec584d --- /dev/null +++ b/configs/accl/sega_simple.py @@ -0,0 +1,208 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
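interleave_addresses() in sega.py above carves one flat vertex range into 2 * num_gpts pseudo-channel ranges (the simple variant below uses num_gpts plain ranges); the interleaving bits sit just above the cache-line offset. A standalone check of that arithmetic, with example values:

```python
from math import log2

cache_line_size = 32  # bytes, so bits [4:0] address bytes within a line
num_channels = 8      # e.g. 2 * num_gpts for num_gpts = 4

intlv_low_bit = int(log2(cache_line_size))       # 5
intlv_bits = int(log2(num_channels))             # 3 bits pick one of 8 channels
intlv_high_bit = intlv_low_bit + intlv_bits - 1  # 7, passed as intlvHighBit

def channel_of(addr):
    # Bits [7:5] select the channel (ignoring the xorHighBit fold).
    return (addr >> intlv_low_bit) & (num_channels - 1)

assert channel_of(0x00) == 0
assert channel_of(0x20) == 1   # the next 32-byte atom maps to the next channel
assert channel_of(0x100) == 0  # wraps around after num_channels atoms
```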
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGA(System): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): + super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + # Building the CenteralController + self.ctrl = CenteralController( + vertex_image_file=f"{graph_path}/vertices" + ) + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("4GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) + 
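+            # Each EdgeMemory is shared by two GPTs: with num_gpts / 2 edge
+            # memories, GPT i and GPT i + num_gpts / 2 both map to memory
+            # i % (num_gpts / 2).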
gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sssp.py b/configs/accl/sssp.py new file mode 100644 index 0000000000..f2e60b856a --- /dev/null +++ b/configs/accl/sssp.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
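sssp.py below seeds a single source vertex: init_addr is the byte offset of that vertex's WorkListItem in the vertices image (vertex_id * 16, since the packed WorkListItem is 16 bytes) and init_value is its initial distance. A hypothetical invocation with placeholder paths:

```sh
# SSSP from vertex 4 (init_addr = 4 * 16 = 64) with initial distance 0.
build/NULL/gem5.opt configs/accl/sssp.py 8 32 1MiB /path/to/graph 64 0
```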
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_sssp_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md new file mode 100644 index 0000000000..ebfca7e794 --- /dev/null +++ b/src/accl/graph/TODO.md @@ -0,0 +1,8 @@ +# TODO Items + +* We might need to revisit the fact that we could insert something to a queue on + the same cycle that another event is consuming something from the queue. +* Move checking for wl.degree == 0 to coalesce engine. +* Fix the retry system between memory queue and coalesce engine +* Update inheritance: There is not enough reason for PushEngine and +CoalesceEngine to be of the same type (i.e. delete BaseMemEngine). diff --git a/src/accl/graph/base/BaseReduceEngine.py b/src/accl/graph/base/BaseReduceEngine.py new file mode 100644 index 0000000000..0585c36e48 --- /dev/null +++ b/src/accl/graph/base/BaseReduceEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseReduceEngine(ClockedObject): + abstract = True + type = 'BaseReduceEngine' + cxx_header = "accl/graph/base/base_reduce_engine.hh" + cxx_class = 'gem5::BaseReduceEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript new file mode 100644 index 0000000000..35111c34d2 --- /dev/null +++ b/src/accl/graph/base/SConscript @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import("*") + +SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) + +Source("base_reduce_engine.cc") +Source("graph_workload.cc") diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc new file mode 100644 index 0000000000..ade95800d2 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/base_reduce_engine.hh" + +namespace gem5 +{ + +BaseReduceEngine::BaseReduceEngine(const Params ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)) +{} + +BaseReduceEngine::~BaseReduceEngine() +{} + +} diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh new file mode 100644 index 0000000000..268bb60b76 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ + +#include "params/BaseReduceEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReduceEngine : public ClockedObject +{ + private: + System* system; + + protected: + + const RequestorID _requestorId; + + public: + PARAMS(BaseReduceEngine); + BaseReduceEngine(const Params ¶ms); + ~BaseReduceEngine(); + + RequestorID requestorId() { return _requestorId; } +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh new file mode 100644 index 0000000000..a391e0794d --- /dev/null +++ b/src/accl/graph/base/data_structs.hh @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__
+#define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__
+
+#include "base/cprintf.hh"
+#include "base/intmath.hh"
+
+// The bracketed header names below were lost in extraction; these are the
+// inferred includes for the code in this file (std::deque, fixed-width ints).
+#include <cassert>
+#include <cstdint>
+#include <deque>
+
+namespace gem5
+{
+
+struct __attribute__ ((packed)) WorkListItem
+{
+    uint32_t tempProp : 32;
+    uint32_t prop : 32;
+    uint32_t edgeIndex : 32;
+    uint32_t degree : 30;
+    bool activeNow: 1;
+    bool activeFuture: 1;
+
+    std::string to_string()
+    {
+        return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, "
+                        "degree: %u, activeNow: %s, activeFuture: %s}",
+                        tempProp, prop, edgeIndex, degree,
+                        activeNow ? "true" : "false",
+                        activeFuture ? "true" : "false");
+    }
+
+    WorkListItem():
+        tempProp(0),
+        prop(0),
+        edgeIndex(0),
+        degree(0),
+        activeNow(false),
+        activeFuture(false)
+    {}
+
+    WorkListItem(uint32_t temp_prop, uint32_t prop,
+                 uint32_t degree, uint32_t edge_index,
+                 bool active_now, bool active_future):
+        tempProp(temp_prop), prop(prop), edgeIndex(edge_index), degree(degree),
+        activeNow(active_now), activeFuture(active_future)
+    {}
+
+};
+
+struct __attribute__ ((packed)) Edge
+{
+    uint16_t weight : 16;
+    uint64_t neighbor : 48;
+
+    std::string to_string()
+    {
+        return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor);
+    }
+
+    Edge(): weight(0), neighbor(0) {}
+
+    Edge(uint16_t weight, uint64_t neighbor):
+        weight(weight),
+        neighbor(neighbor)
+    {}
+};
+
+static_assert(isPowerOf2(sizeof(WorkListItem)));
+static_assert(isPowerOf2(sizeof(Edge)));
+
+struct MetaEdge {
+    uint64_t src;
+    uint64_t dst;
+    uint32_t weight;
+    uint32_t value;
+
+    MetaEdge(): src(0), dst(0), weight(0), value(0)
+    {}
+    MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value):
+        src(src), dst(dst), weight(weight), value(value)
+    {}
+
+    std::string to_string()
+    {
+        return csprintf("MetaEdge{src: %lu, dst: %lu, weight: %u, value: %u}",
+                        src, dst, weight, value);
+    }
+};
+
+struct Update {
+    uint64_t src;
+    uint64_t dst;
+    uint32_t value;
+
+    Update(): src(0), dst(0), value(0)
+    {}
+    Update(uint64_t src, uint64_t dst, uint32_t value):
+        src(src), dst(dst), value(value)
+    {}
+
+    std::string to_string()
+    {
+        return csprintf("Update{src: %lu, dst: %lu, value: %u}",
+                        src, dst, value);
+    }
+};
+
+template <typename T>
+class UniqueFIFO
+{
+  private:
+    int cap;
+    int pop;
+
+    int* added;
+    int* deleted;
+    std::deque<T> container;
+
+  public:
+    UniqueFIFO() {
+        cap = 0;
+        pop = 0;
+        added = nullptr;
+        deleted = nullptr;
+    }
+
+    UniqueFIFO(int size) {
+        cap = size;
+        pop = 0;
+
+        added = (int*) new int [cap];
+        deleted = (int*) new int [cap];
+
+        for (int i = 0; i < cap; i++) {
+            added[i] = 0;
+            deleted[i] = 0;
+        }
+        container.clear();
+    }
+
+    void fix_front() {
+        while(true) {
+            T elem = container.front();
+            if (deleted[elem] > 0) {
+                deleted[elem]--;
+                added[elem]--;
+                container.pop_front();
+            } else {
+                assert(deleted[elem] == 0);
+                assert(added[elem] == 1);
+                break;
+            }
+        }
+    }
+
+    T front() {
+        fix_front();
+        return container.front();
+    }
+
+    size_t size() {
+        return pop;
+    }
+
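+    // NOTE: erase() only marks an element as deleted and drops the live
+    // count; the stale entry stays in `container` until fix_front() skips
+    // past it, which is why front() and pop_front() go through fix_front().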
+    void clear() {
+        pop = 0;
+        for (int i = 0; i < cap; i++) {
+            added[i] = 0;
+            deleted[i] = 0;
+        }
+        container.clear();
+    }
+
+    bool empty() {
+        return size() == 0;
+    }
+
+    bool find(T item) {
+        assert(added[item] >= 0);
+        assert(deleted[item] >= 0);
+        int diff = added[item] - deleted[item];
+        assert((diff == 0) || (diff == 1));
+        return (diff == 1);
+    }
+
+    void push_back(T item) {
+        if (!find(item)) {
+            added[item]++;
+            pop++;
+            container.push_back(item);
+        }
+    }
+
+    void pop_front() {
+        T elem = front();
+        assert(added[elem] == 1);
+        added[elem] = 0;
+        pop--;
+        container.pop_front();
+    }
+
+    void erase(T item) {
+        assert(find(item));
+        deleted[item]++;
+        pop--;
+    }
+
+    void operator=(const UniqueFIFO& rhs) {
+        pop = rhs.pop;
+        container = rhs.container;
+        added = rhs.added;
+        deleted = rhs.deleted;
+    }
+};
+
+}
+
+#endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__
diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc
new file mode 100644
index 0000000000..fd802cf275
--- /dev/null
+++ b/src/accl/graph/base/graph_workload.cc
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/base/graph_workload.hh"
+
+// The bracketed header name below was lost in extraction; <cstring> (for
+// std::memcpy) is the inferred original.
+#include <cstring>
+
+#include "base/cprintf.hh"
+#include "base/intmath.hh"
+
+namespace gem5
+{
+
+template <typename T>
+float
+writeToFloat(T value)
+{
+    assert(sizeof(T) == sizeof(float));
+    float float_form;
+    std::memcpy(&float_form, &value, sizeof(float));
+    return float_form;
+}
+
+// NOTE: readFromFloat's template argument cannot be deduced from its
+// parameter, so call sites pass it explicitly; the <typename T> and
+// <uint32_t> brackets were lost in extraction and have been restored.
+template <typename T>
+T
+readFromFloat(float value)
+{
+    assert(sizeof(T) == sizeof(float));
+    T float_bits;
+    std::memcpy(&float_bits, &value, sizeof(float));
+    return float_bits;
+}
+
+void
+BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir)
+{
+    size_t pkt_size = pkt->getSize();
+    uint64_t aligned_addr = roundDown(initAddr, pkt_size);
+
+    if (pkt->getAddr() == aligned_addr) {
+        int num_elements = (int) (pkt_size / sizeof(WorkListItem));
+        WorkListItem items[num_elements];
+
+        pkt->writeDataToBlock((uint8_t*) items, pkt_size);
+        int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem));
+        WorkListItem new_wl = items[index];
+        new_wl.tempProp = initValue;
+        if (activeCondition(new_wl, items[index])) {
+            new_wl.activeNow = true;
+            dir->activate(aligned_addr);
+        }
+        items[index] = new_wl;
+
+        pkt->deleteData();
+        pkt->allocate();
+        pkt->setDataFromBlock((uint8_t*) items, pkt_size);
+    }
+}
+
+uint32_t
+BFSWorkload::reduce(uint32_t update, uint32_t value)
+{
+    return std::min(update, value);
+}
+
+uint32_t
+BFSWorkload::propagate(uint32_t value, uint32_t weight)
+{
+    return value + 1;
+}
+
+bool
+BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl)
+{
+    return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree > 0);
+}
+
+uint32_t
+BFSWorkload::apply(WorkListItem& wl)
+{
+    wl.prop = wl.tempProp;
+    return wl.prop;
+}
+
+std::string
+BFSWorkload::printWorkListItem(const WorkListItem wl)
+{
+    return csprintf(
+        "WorkListItem{tempProp: %u, prop: %u, degree: %u, "
+        "edgeIndex: %u, activeNow: %s, activeFuture: %s}",
+        wl.tempProp, wl.prop, wl.degree, wl.edgeIndex,
+        wl.activeNow ? "true" : "false",
+        wl.activeFuture ? "true" : "false");
+}
+
+uint32_t
+BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) {
+    return value;
+}
+
+void
+CCWorkload::init(PacketPtr pkt, WorkDirectory* dir)
+{
+    Addr pkt_addr = pkt->getAddr();
+    size_t pkt_size = pkt->getSize();
+    int num_elements = (int) (pkt_size / sizeof(WorkListItem));
+    WorkListItem items[num_elements];
+
+    pkt->writeDataToBlock((uint8_t*) items, pkt_size);
+    bool atom_active = false;
+    for (int i = 0; i < num_elements; i++) {
+        WorkListItem new_wl = items[i];
+        new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i;
+        new_wl.activeNow = activeCondition(new_wl, items[i]);
+        atom_active |= new_wl.activeNow;
+        items[i] = new_wl;
+    }
+    if (atom_active) {
+        dir->activate(pkt->getAddr());
+    }
+    pkt->deleteData();
+    pkt->allocate();
+    pkt->setDataFromBlock((uint8_t*) items, pkt_size);
+}
+
+uint32_t
+SSSPWorkload::propagate(uint32_t value, uint32_t weight)
+{
+    return value + weight;
+}
+
+void
+PRWorkload::init(PacketPtr pkt, WorkDirectory* dir)
+{
+    int num_elements = pkt->getSize() / sizeof(WorkListItem);
+    WorkListItem items[num_elements];
+    pkt->writeDataToBlock((uint8_t*) items, pkt->getSize());
+
+    bool atom_active = false;
+    for (int index = 0; index < num_elements; index++) {
+        WorkListItem new_wl = items[index];
+        new_wl.tempProp = readFromFloat<uint32_t>(0);
+        new_wl.prop = readFromFloat<uint32_t>(1 - alpha);
+        new_wl.activeNow = activeCondition(new_wl, items[index]);
+        atom_active |= new_wl.activeNow;
+        items[index] = new_wl;
+    }
+    if (atom_active) {
+        dir->activate(pkt->getAddr());
+    }
+
+    pkt->deleteData();
+    pkt->allocate();
+    pkt->setDataFromBlock((uint8_t*) items, pkt->getSize());
+}
+
+uint32_t
+PRWorkload::reduce(uint32_t update, uint32_t value)
+{
+    float update_float = writeToFloat(update);
+    float value_float = writeToFloat(value);
+    return readFromFloat<uint32_t>(update_float + value_float);
+}
+
+uint32_t
+PRWorkload::propagate(uint32_t value, uint32_t weight)
+{
+    float value_float = writeToFloat(value);
+    float weight_float = writeToFloat(weight);
+    if (weight == 0) {
+        weight_float = 1.0;
+    }
+    return readFromFloat<uint32_t>(alpha * value_float * weight_float);
+}
+
+bool
+PRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl)
+{
+    float temp_float = writeToFloat(new_wl.tempProp);
+    float prop_float = writeToFloat(new_wl.prop);
+    float dist = std::abs(temp_float - prop_float);
+    return (dist >= threshold) && (new_wl.degree > 0);
+}
+
+uint32_t
+PRWorkload::apply(WorkListItem& wl)
+{
+    float temp_float = writeToFloat(wl.tempProp);
+    float prop_float = writeToFloat(wl.prop);
+    float delta = (temp_float - prop_float) / wl.degree;
+    wl.prop = wl.tempProp;
+    return readFromFloat<uint32_t>(delta);
+}
+
+std::string
+PRWorkload::printWorkListItem(const WorkListItem wl)
+{
+    float temp_float = writeToFloat(wl.tempProp);
+    float prop_float = writeToFloat(wl.prop);
+    return csprintf(
+        "WorkListItem{tempProp: %f, prop: %f, degree: %u, "
+        "edgeIndex: %u, activeNow: %s, activeFuture: %s}",
+        temp_float, prop_float, wl.degree, wl.edgeIndex,
+        wl.activeNow ? "true" : "false",
+        wl.activeFuture ? "true" : "false");
+}
+
+void
+BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir)
+{
+    size_t pkt_size = pkt->getSize();
+    int num_elements = (int) (pkt_size / sizeof(WorkListItem));
+    WorkListItem items[num_elements];
+
+    pkt->writeDataToBlock((uint8_t*) items, pkt_size);
+    bool atom_active = false;
+    for (int i = 0; i < num_elements; i++) {
+        WorkListItem new_wl = items[i];
+        new_wl.tempProp = readFromFloat<uint32_t>((1 - alpha) / numNodes);
+        new_wl.prop = readFromFloat<uint32_t>(1 / numNodes);
+        new_wl.activeNow = activeCondition(new_wl, items[i]);
+        atom_active |= new_wl.activeNow;
+        items[i] = new_wl;
+    }
+    if (atom_active) {
+        dir->activate(pkt->getAddr());
+    }
+    pkt->deleteData();
+    pkt->allocate();
+    pkt->setDataFromBlock((uint8_t*) items, pkt_size);
+}
+
+uint32_t
+BSPPRWorkload::reduce(uint32_t update, uint32_t value)
+{
+    float update_float = writeToFloat(update);
+    float value_float = writeToFloat(value);
+    return readFromFloat<uint32_t>(update_float + value_float);
+}
+
+uint32_t
+BSPPRWorkload::propagate(uint32_t value, uint32_t weight)
+{
+    float value_float = writeToFloat(value);
+    return readFromFloat<uint32_t>(alpha * value_float);
+}
+
+bool
+BSPPRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl)
+{
+    return (old_wl.degree > 0);
+}
+
+uint32_t
+BSPPRWorkload::apply(WorkListItem& wl)
+{
+    float prop_float = writeToFloat(wl.prop);
+    float delta = prop_float / wl.degree;
+    uint32_t delta_uint = readFromFloat<uint32_t>(delta);
+    return delta_uint;
+}
+
+void
+BSPPRWorkload::interIterationInit(WorkListItem& wl)
+{
+    float temp_float = writeToFloat(wl.tempProp);
+    float prop_float = writeToFloat(wl.prop);
+    error += std::abs(temp_float - prop_float);
+    wl.prop = wl.tempProp;
+    wl.tempProp = readFromFloat<uint32_t>((1 - alpha) / numNodes);
+    wl.activeFuture = (wl.degree > 0);
+}
+
+std::string
+BSPPRWorkload::printWorkListItem(const WorkListItem wl)
+{
+    float temp_float = writeToFloat(wl.tempProp);
+    float prop_float = writeToFloat(wl.prop);
+    return csprintf(
+        "WorkListItem{tempProp: %f, prop: %f, degree: %u, "
+        "edgeIndex: %u, activeNow: %s, activeFuture: %s}",
+        temp_float, prop_float, wl.degree, wl.edgeIndex,
+        wl.activeNow ? "true" : "false",
+        wl.activeFuture ? "true" : "false");
+}
+
+void
+BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir)
+{
+    int pkt_size = pkt->getSize();
+    int aligned_addr = roundDown(initAddr, pkt_size);
+
+    if (aligned_addr == pkt->getAddr()) {
+        int num_elements = pkt_size / sizeof(WorkListItem);
+        WorkListItem items[num_elements];
+        pkt->writeDataToBlock((uint8_t*) items, pkt_size);
+        int index = (initAddr - aligned_addr) / sizeof(WorkListItem);
+        WorkListItem new_wl = items[index];
+        uint32_t prop = 0;
+        prop |= initValue;
+        // NOTE: Depth of the initial vertex is 0.
+        prop &= countMask;
+        new_wl.tempProp = prop;
+        new_wl.prop = prop;
+        if (activeCondition(new_wl, items[index])) {
+            new_wl.activeNow = true;
+            dir->activate(aligned_addr);
+        }
+        items[index] = new_wl;
+
+        pkt->deleteData();
+        pkt->allocate();
+        pkt->setDataFromBlock((uint8_t*) items, pkt_size);
+    }
+}
+
+uint32_t
+BSPBCWorkload::reduce(uint32_t update, uint32_t value)
+{
+    uint32_t update_depth = (update & depthMask) >> 24;
+    uint32_t update_count = (update & countMask);
+    uint32_t value_depth = (value & depthMask) >> 24;
+    uint32_t value_count = (value & countMask);
+    if (value_depth == 255) {
+        value_depth = currentDepth;
+        value_count = 0;
+    }
+    if (value_depth == currentDepth) {
+        value_count += update_count;
+    }
+    uint32_t ret = 0;
+    ret |= value_count;
+    warn_if(value_count > 16777215, "value count has grown bigger than "
+            "16777215 (2^24 - 1). This means the algorithm result might not "
+            "be correct. However, the traversal will not be affected. "
+            "Therefore, performance metrics can still be used.");
+    // HACK: Make sure to always set the depth correctly even if count
+    // exceeds the 2^24-1 limit. Here we reset the depth section of ret.
+    ret &= countMask;
+    // NOTE: Now that the depth is securely reset we can copy the correct
+    // value.
+    ret |= (value_depth << 24);
+    return ret;
+}
+
+uint32_t
+BSPBCWorkload::propagate(uint32_t value, uint32_t weight)
+{
+    return value;
+}
+
+uint32_t
+BSPBCWorkload::apply(WorkListItem& wl)
+{
+    return wl.prop;
+}
+
+void
+BSPBCWorkload::interIterationInit(WorkListItem& wl)
+{
+    wl.prop = wl.tempProp;
+}
+
+bool
+BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl)
+{
+    uint32_t depth = (new_wl.tempProp & depthMask) >> 24;
+    return (depth == currentDepth) && (new_wl.degree > 0);
+}
+
+std::string
+BSPBCWorkload::printWorkListItem(WorkListItem wl)
+{
+    uint32_t temp_depth = (wl.tempProp & depthMask) >> 24;
+    uint32_t temp_count = (wl.tempProp & countMask);
+    uint32_t depth = (wl.prop & depthMask) >> 24;
+    uint32_t count = (wl.prop & countMask);
+    return csprintf(
+        "WorkListItem{tempProp: (depth: %d, count: %d), "
+        "prop: (depth: %d, count: %d), degree: %u, "
+        "edgeIndex: %u, activeNow: %s, activeFuture: %s}",
+        temp_depth, temp_count, depth, count, wl.degree, wl.edgeIndex,
+        wl.activeNow ? "true" : "false",
+        wl.activeFuture ? "true" : "false");
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh
new file mode 100644
index 0000000000..72748502c1
--- /dev/null
+++ b/src/accl/graph/base/graph_workload.hh
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__
+#define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__
+
+#include <cstdint>
+#include <string>
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/sega/work_directory.hh"
+#include "mem/packet.hh"
+
+
+namespace gem5
+{
+
+class GraphWorkload
+{
+  public:
+    GraphWorkload() {}
+    virtual ~GraphWorkload() {}
+
+    virtual void init(PacketPtr pkt, WorkDirectory* dir) = 0;
+    virtual uint32_t reduce(uint32_t update, uint32_t value) = 0;
+    virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0;
+    virtual uint32_t apply(WorkListItem& wl) = 0;
+    virtual void iterate() = 0;
+    virtual void interIterationInit(WorkListItem& wl) = 0;
+    virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0;
+    virtual std::string printWorkListItem(const WorkListItem wl) = 0;
+};
+
+class BFSWorkload : public GraphWorkload
+{
+  private:
+    uint64_t initAddr;
+    uint32_t initValue;
+
+  public:
+    BFSWorkload(uint64_t init_addr, uint32_t init_value):
+        initAddr(init_addr), initValue(init_value)
+    {}
+
+    ~BFSWorkload() {}
+
+    virtual void init(PacketPtr pkt, WorkDirectory* dir);
+    virtual uint32_t reduce(uint32_t update, uint32_t value);
+    virtual uint32_t propagate(uint32_t value, uint32_t weight);
+    virtual uint32_t apply(WorkListItem& wl);
+    virtual void iterate() {}
+    virtual void interIterationInit(WorkListItem& wl) {}
+    virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl);
+    virtual std::string printWorkListItem(const WorkListItem wl);
+};
+
+class BFSVisitedWorkload : public BFSWorkload
+{
+  public:
+    BFSVisitedWorkload(Addr init_addr, uint32_t init_value):
+        BFSWorkload(init_addr, init_value)
+    {}
+    virtual uint32_t propagate(uint32_t value, uint32_t weight) override;
+};
+
+class CCWorkload : public BFSVisitedWorkload
+{
+  public:
+    CCWorkload(): BFSVisitedWorkload(0, 0) {}
+    virtual void init(PacketPtr pkt, WorkDirectory* dir);
+};
+
+class SSSPWorkload : public BFSWorkload
+{
+  public:
+    SSSPWorkload(Addr init_addr, uint32_t init_value):
+        BFSWorkload(init_addr, init_value)
+    {}
+    virtual uint32_t propagate(uint32_t value, uint32_t weight) override;
+};
+
+class PRWorkload : public GraphWorkload
+{
+  private:
+    float alpha;
+    float threshold;
+
+  public:
+    PRWorkload(float alpha, float threshold):
+        alpha(alpha), threshold(threshold)
+    {}
+
+    ~PRWorkload() {}
+
+    virtual void init(PacketPtr pkt, WorkDirectory* dir);
+    virtual uint32_t reduce(uint32_t update, uint32_t value);
+    virtual uint32_t propagate(uint32_t value, uint32_t weight);
+    virtual uint32_t apply(WorkListItem& wl);
+    virtual void iterate() {}
+    virtual void interIterationInit(WorkListItem& wl) {}
+    virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl);
+    
virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class BSPPRWorkload : public GraphWorkload +{ + private: + int numNodes; + float alpha; + float prevError; + float error; + + public: + BSPPRWorkload(int num_nodes, float alpha): + numNodes(num_nodes), alpha(alpha), prevError(0), error(0) + {} + + ~BSPPRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { prevError = error; error = 0; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); + + float getError() { return prevError; } +}; + +class BSPBCWorkload : public GraphWorkload +{ + private: + Addr initAddr; + uint32_t initValue; + + int currentDepth; + + uint32_t depthMask; + uint32_t countMask; + public: + BSPBCWorkload(Addr init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value), + currentDepth(0), depthMask(4278190080), countMask(16777215) + {} + + ~BSPBCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { currentDepth++; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +} + +#endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ diff --git a/src/accl/graph/sega/BaseMemoryEngine.py b/src/accl/graph/sega/BaseMemoryEngine.py new file mode 100644 index 0000000000..10d8b708f0 --- /dev/null +++ b/src/accl/graph/sega/BaseMemoryEngine.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseMemoryEngine(ClockedObject): + abstract = True + type = 'BaseMemoryEngine' + cxx_header = "accl/graph/sega/base_memory_engine.hh" + cxx_class = 'gem5::BaseMemoryEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py new file mode 100644 index 0000000000..c5f44c82e9 --- /dev/null +++ b/src/accl/graph/sega/CenteralController.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
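+#
+# CenteralController is the host-facing SimObject of SEGA: it loads the
+# vertex image into the MPUs' memories, selects the processing mode
+# (asynchronous or bulk-synchronous), instantiates the chosen workload
+# (BFS, SSSP, CC, PR, BC), and exposes those hooks to the config scripts
+# through the PyBindMethod exports below.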
+ +from m5.params import * +from m5.proxy import * +from m5.util.pybind import PyBindMethod +from m5.objects.ClockedObject import ClockedObject + +class CenteralController(ClockedObject): + type = 'CenteralController' + cxx_header = "accl/graph/sega/centeral_controller.hh" + cxx_class = 'gem5::CenteralController' + + system = Param.System(Parent.any, "System this Engine is a part of") + + vertex_image_file = Param.String("Path to the vertex image file.") + + mpu_vector = VectorParam.MPU("All mpus in the system.") + + cxx_exports = [ + PyBindMethod("setAsyncMode"), + PyBindMethod("setBSPMode"), + PyBindMethod("createPopCountDirectory"), + PyBindMethod("createBFSWorkload"), + PyBindMethod("createBFSVisitedWorkload"), + PyBindMethod("createSSSPWorkload"), + PyBindMethod("createCCWorkload"), + PyBindMethod("createAsyncPRWorkload"), + PyBindMethod("createPRWorkload"), + PyBindMethod("createBCWorkload"), + PyBindMethod("workCount"), + PyBindMethod("getPRError"), + PyBindMethod("printAnswerToHostSimout") + ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py new file mode 100644 index 0000000000..25f8a1c58b --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CoalesceEngine(BaseMemoryEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + cache_size = Param.MemorySize("Size of the internal SRAM array.") + + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. Used to limit b/w.") + pending_pull_limit = Param.Int("Maximum number of pending pull processes.") + active_buffer_size = Param.Int("Maximum number of memory active memory " + "atoms ready to send updates. This parameter " + "and post_push_wb_queue_size should be set " + "in tandem. 
Probably, they should be equal.") + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..79fa7db8d0 --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +# from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class MPU(ClockedObject): + type = "MPU" + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = "gem5::MPU" + + system = Param.System(Parent.any, "System this MPU is a part of") + + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " + "MPU object.") + coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " + "each instance of MPU object.") + push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " + "instance of MPU object.") + diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py new file mode 100644 index 0000000000..63fa1eae62 --- /dev/null +++ b/src/accl/graph/sega/PushEngine.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class PushEngine(BaseMemoryEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + push_req_queue_size = Param.Int("Size of the queue to " + "queue push requests.") + # resp_queue_size should probably be + # significantly bigger than push_req_queue_size + resp_queue_size = Param.Int("Size of the response queue in the " + "push engine where it stores the " + "edges read from memory.") + + max_propagates_per_cycle = Param.Int("Maximum number of propagates " + "done per cycle.") + + update_queue_size = Param.Int("Maximum number of entries " + "for each update queue.") + + out_ports = VectorRequestPort("Outgoing ports to all MPUs") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript new file mode 100644 index 0000000000..b3e1a838fb --- /dev/null +++ b/src/accl/graph/sega/SConscript @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import("*") + +SimObject("BaseMemoryEngine.py", sim_objects=["BaseMemoryEngine"]) +SimObject("CenteralController.py", sim_objects=["CenteralController"]) +SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) +SimObject("MPU.py", sim_objects=["MPU"]) +SimObject("PushEngine.py", sim_objects=["PushEngine"]) +SimObject("WLEngine.py", sim_objects=["WLEngine"]) + +Source("base_memory_engine.cc") +Source("centeral_controller.cc") +Source("coalesce_engine.cc") +Source("enums.cc") +Source("mpu.cc") +Source("push_engine.cc") +Source("wl_engine.cc") + +DebugFlag("BaseMemoryEngine") +DebugFlag("CenteralController") +DebugFlag("CacheBlockState") +DebugFlag("CoalesceEngine") +DebugFlag("PushEngine") +DebugFlag("SEGAStructureSize") +DebugFlag("MSDebug") +DebugFlag("WLEngine") + +CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", + "WLEngine", "BaseMemoryEngine"]) diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py new file mode 100644 index 0000000000..5a8ed9c9fd --- /dev/null +++ b/src/accl/graph/sega/WLEngine.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
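+#
+# WLEngine is the reduce stage of an MPU: updates arriving over in_ports
+# are buffered in a queue of update_queue_size entries and reduced into
+# worklist items through a register file with register_file_size entries,
+# which bounds how many updates can be in service at the same time.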
+
+from m5.params import *
+from m5.proxy import *
+from m5.objects.BaseReduceEngine import BaseReduceEngine
+
+class WLEngine(BaseReduceEngine):
+    type = 'WLEngine'
+    cxx_header = "accl/graph/sega/wl_engine.hh"
+    cxx_class = 'gem5::WLEngine'
+
+    in_ports = VectorResponsePort("Incoming ports to receive updates "
+                                  "from remote PushEngines")
+
+    update_queue_size = Param.Int("Size of the queue where the WLEngine "
+                                  "stores the incoming updates")
+
+    register_file_size = Param.Int("Number of internal registers the "
+                                   "WLEngine has. It can service as many "
+                                   "updates at the same time as this "
+                                   "register file has entries.")
diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc
new file mode 100644
index 0000000000..9f704f71e9
--- /dev/null
+++ b/src/accl/graph/sega/base_memory_engine.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/base_memory_engine.hh"
+
+#include "debug/BaseMemoryEngine.hh"
+#include "debug/SEGAStructureSize.hh"
+
+namespace gem5
+{
+
+BaseMemoryEngine::BaseMemoryEngine(const BaseMemoryEngineParams &params):
+    ClockedObject(params),
+    system(params.system),
+    _requestorId(system->getRequestorId(this)),
+    memPort(name() + ".mem_port", this),
+    peerMemoryAtomSize(params.attached_memory_atom_size)
+{}
+
+BaseMemoryEngine::~BaseMemoryEngine()
+{}
+
+Port&
+BaseMemoryEngine::getPort(const std::string &if_name, PortID idx)
+{
+    if (if_name == "mem_port") {
+        return memPort;
+    } else {
+        return SimObject::getPort(if_name, idx);
+    }
+}
+
+void
+BaseMemoryEngine::init()
+{
+    AddrRangeList memory_ranges = memPort.getAddrRanges();
+
+    assert(memory_ranges.size() == 1);
+
+    peerMemoryRange = memory_ranges.front();
+
+    DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is "
+            "%s. The range is %sinterleaved.\n", __func__,
+            peerMemoryRange.to_string(),
+            peerMemoryRange.interleaved() ? "" : "not ");
+}
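+
+// MemPort holds at most one blocked packet: if sendTimingReq() is rejected,
+// the packet is parked in blockedPacket until recvReqRetry(); on a
+// successful send the owner is poked through recvMemRetry() so it can issue
+// its next memory function.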
"" : "not"); +} + +void +BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + DPRINTF(BaseMemoryEngine, "%s: Sending pakcet: %s to " + "the memory.\n", __func__, pkt->print()); + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked.\n", __func__); + } else { + DPRINTF(BaseMemoryEngine, "%s: Packet sent successfully.\n", __func__); + owner->recvMemRetry(); + } +} + +bool +BaseMemoryEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +BaseMemoryEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), + "Received retry without a blockedPacket"); + + _blocked = false; + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +PacketPtr +BaseMemoryEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + +PacketPtr +BaseMemoryEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +} diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh new file mode 100644 index 0000000000..afe7fd0433 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
+
+#include <functional>
+#include <string>
+
+#include "base/addr_range.hh"
+#include "mem/packet.hh"
+#include "mem/port.hh"
+#include "params/BaseMemoryEngine.hh"
+#include "sim/clocked_object.hh"
+#include "sim/system.hh"
+
+namespace gem5
+{
+
+class BaseMemoryEngine : public ClockedObject
+{
+  protected:
+    class MemoryEvent : public EventFunctionWrapper
+    {
+      private:
+        bool _pending;
+        int _prevState;
+
+      public:
+        MemoryEvent(const std::function<void(void)> &callback,
+                    const std::string &name):
+            EventFunctionWrapper(callback, name),
+            _pending(false), _prevState(0)
+        {}
+        bool pending() { return _pending; }
+        void sleep() { _pending = true; }
+        void wake() { _pending = false; }
+        void setPrevState(int state) { _prevState = state; }
+        int getPrevState() { return _prevState; }
+    };
+
+    class MemPort : public RequestPort
+    {
+      private:
+        BaseMemoryEngine* owner;
+        bool _blocked;
+        PacketPtr blockedPacket;
+
+      public:
+        MemPort(const std::string& name, BaseMemoryEngine* owner):
+            RequestPort(name, owner), owner(owner),
+            _blocked(false), blockedPacket(nullptr)
+        {}
+
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return _blocked; }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    System* system;
+    const RequestorID _requestorId;
+
+    MemPort memPort;
+    AddrRange peerMemoryRange;
+    size_t peerMemoryAtomSize;
+
+    virtual void recvMemRetry() = 0;
+    virtual bool handleMemResp(PacketPtr pkt) = 0;
+
+    PacketPtr createReadPacket(Addr addr, unsigned int size);
+    PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data);
+
+  public:
+    PARAMS(BaseMemoryEngine);
+
+    BaseMemoryEngine(const Params &params);
+    ~BaseMemoryEngine();
+
+    Port& getPort(const std::string &if_name,
+                PortID idx=InvalidPortID) override;
+
+    AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); }
+
+    virtual void recvFunctional(PacketPtr pkt) = 0;
+
+    virtual void init() override;
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
diff --git a/src/accl/graph/sega/busyMaskErr b/src/accl/graph/sega/busyMaskErr
new file mode 100644
index 0000000000..316fcd37d9
--- /dev/null
+++ b/src/accl/graph/sega/busyMaskErr
@@ -0,0 +1,16 @@
+gem5/build/NULL/gem5.opt -re --outdir=debug --debug-flags=CacheBlockState gem5/configs/accl/sega.py 1 1KiB /home/fariborz/SEGA/graphs/test/scale_21/binaries/mpu_1/ 0 0
+
+32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964145000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlock[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. +32964147000: system.gpts.coalesce_engine: processNextWriteBack: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. + +// This assertion would be hit although it should not. +// It is fixed by a hack in recvWLRead when hit in the cache. +assert(cacheBlocks[block_index].busyMask == 0); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc new file mode 100644 index 0000000000..fc4bacd414 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.cc @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "accl/graph/sega/centeral_controller.hh"
+
+#include <iostream>
+
+#include "base/cprintf.hh"
+#include "base/loader/memory_image.hh"
+#include "base/loader/object_file.hh"
+#include "debug/CenteralController.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+namespace gem5
+{
+
+CenteralController::CenteralController(const Params& params):
+    ClockedObject(params),
+    system(params.system),
+    mode(ProcessingMode::NOT_SET)
+{
+    for (auto mpu : params.mpu_vector) {
+        mpuVector.push_back(mpu);
+        mpu->registerCenteralController(this);
+    }
+}
+
+void
+CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new BFSWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createBFSVisitedWorkload(Addr init_addr,
+                                             uint32_t init_value)
+{
+    workload = new BFSVisitedWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createSSSPWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new SSSPWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createCCWorkload()
+{
+    workload = new CCWorkload();
+}
+
+void
+CenteralController::createAsyncPRWorkload(float alpha, float threshold)
+{
+    workload = new PRWorkload(alpha, threshold);
+}
+
+void
+CenteralController::createPRWorkload(int num_nodes, float alpha)
+{
+    workload = new BSPPRWorkload(num_nodes, alpha);
+}
+
+void
+CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new BSPBCWorkload(init_addr, init_value);
+}
+
+bool
+CenteralController::bufferRemoteUpdate(int slice_number, PacketPtr pkt)
+{
+    for (auto mpu: mpuVector) {
+        if (contains(mpu->getAddrRanges(), pkt->getAddr())) {
+            remoteUpdates[mpu][slice_number].push_back(pkt);
+        }
+    }
+
+    return true;
+}
+
+void
+CenteralController::createPopCountDirectory(int atoms_per_block)
+{
+    fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing "
+            "mode by calling either setAsyncMode or setBSPMode.");
+    if (mode == ProcessingMode::ASYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->createAsyncPopCountDirectory(atoms_per_block);
+        }
+    }
+    if (mode == ProcessingMode::BULK_SYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->createBSPPopCountDirectory(atoms_per_block);
+        }
+    }
+}
+
+void
+CenteralController::startup()
+{
+    unsigned int vertex_atom = mpuVector.front()->vertexAtomSize();
+    for (auto mpu: mpuVector) {
+        addrRangeListMap[mpu] = mpu->getAddrRanges();
+        mpu->setProcessingMode(mode);
+        mpu->recvWorkload(workload);
+    }
+
+    const auto& vertex_file = params().vertex_image_file;
+    if (vertex_file == "")
+        return;
+
+    auto* object = loader::createObjectFile(vertex_file, true);
+    fatal_if(!object, "%s: Could not load %s.", name(), vertex_file);
+
+    loader::debugSymbolTable.insert(*object->symtab().globals());
+    loader::MemoryImage vertex_image = object->buildImage();
+    maxVertexAddr = vertex_image.maxAddr();
+
+    PortProxy vertex_proxy(
+        [this](PacketPtr pkt) {
+            for (auto mpu: mpuVector) {
+                AddrRangeList range_list = addrRangeListMap[mpu];
+                if (contains(range_list, pkt->getAddr())) {
+                    mpu->recvFunctional(pkt);
+                }
+            }
+        }, vertex_atom);
+
+    panic_if(!vertex_image.write(vertex_proxy),
+            "%s: Unable to write image.", name());
+
+    for (auto mpu: mpuVector) {
+        mpu->postMemInitSetup();
+        if (!mpu->running() && (mpu->workCount() > 0)) {
+            mpu->start();
+        }
+    }
+    workload->iterate();
+}
+
+PacketPtr
+CenteralController::createReadPacket(Addr addr, unsigned int size)
+{
+    RequestPtr req = std::make_shared<Request>(addr, size, 0, 0);
+    // Dummy PC to have PC-based prefetchers latch on; get entropy into
+    // higher bits
+    req->setPC(((Addr) 0) << 2);
+
+    // Embed it in a packet
+    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+    pkt->allocate();
+
+    return pkt;
+}
+
+void
+CenteralController::recvDoneSignal()
+{
+    bool done = true;
+    for (auto mpu : mpuVector) {
+        done &= mpu->done();
+        int total_num_slices = remoteUpdates[mpu].size();
+        if (mpu->done()) {
+            int slice_number = mpu->getSliceCounter() + 1;
+            while ((total_num_slices != 0) &&
+                   (slice_number != mpu->getSliceCounter())) {
+                if (!remoteUpdates[mpu][slice_number].empty()) {
+                    mpu->scheduleNewSlice();
+                    mpu->updateSliceCounter(slice_number);
+                    done = false;
+                    break;
+                } else {
+                    // Wrap the slice index around past the last slice.
+                    if (slice_number == total_num_slices) {
+                        slice_number = 0;
+                    } else {
+                        slice_number++;
+                    }
+                }
+            }
+        }
+    }
+
+    if (done && mode == ProcessingMode::ASYNCHRONOUS) {
+        exitSimLoopNow("no update left to process.");
+    }
+
+    if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->postConsumeProcess();
+            mpu->swapDirectories();
+            if (!mpu->running() && (mpu->workCount() > 0)) {
+                mpu->start();
+            }
+        }
+        workload->iterate();
+        exitSimLoopNow("finished an iteration.");
+    }
+}
+
+int
+CenteralController::workCount()
+{
+    int work_count = 0;
+    for (auto mpu: mpuVector) {
+        work_count += mpu->workCount();
+    }
+    return work_count;
+}
+
+float
+CenteralController::getPRError()
+{
+    BSPPRWorkload* pr_workload = dynamic_cast<BSPPRWorkload*>(workload);
+    return pr_workload->getError();
+}
+
+void
+CenteralController::printAnswerToHostSimout()
+{
+    unsigned int vertex_atom = mpuVector.front()->vertexAtomSize();
+    int num_items = vertex_atom / sizeof(WorkListItem);
+    WorkListItem items[num_items];
+    for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom)
+    {
+        PacketPtr pkt = createReadPacket(addr, vertex_atom);
+        for (auto mpu: mpuVector) {
+            AddrRangeList range_list = addrRangeListMap[mpu];
+            if (contains(range_list, addr)) {
+                mpu->recvFunctional(pkt);
+            }
+        }
+        pkt->writeDataToBlock((uint8_t*) items, vertex_atom);
+        for (int i = 0; i < num_items; i++) {
+            std::string print = csprintf("WorkListItem[%lu][%d]: %s.",
+                                    addr, i,
+                                    workload->printWorkListItem(items[i]));
+            std::cout << print << std::endl;
+        }
+    }
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh
new file mode 100644
index 0000000000..6692d999ed
--- /dev/null
+++ b/src/accl/graph/sega/centeral_controller.hh
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
+#define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
+
+#include <deque>
+#include <unordered_map>
+#include <vector>
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/enums.hh"
+#include "accl/graph/sega/mpu.hh"
+#include "base/addr_range.hh"
+#include "params/CenteralController.hh"
+#include "sim/clocked_object.hh"
+#include "sim/system.hh"
+
+namespace gem5
+{
+
+class CenteralController : public ClockedObject
+{
+  private:
+    System* system;
+    Addr maxVertexAddr;
+
+    ProcessingMode mode;
+
+    std::vector<MPU*> mpuVector;
+    std::unordered_map<MPU*, AddrRangeList> addrRangeListMap;
+
+    PacketPtr createReadPacket(Addr addr, unsigned int size);
+
+  public:
+    GraphWorkload* workload;
+
+    PARAMS(CenteralController);
+    CenteralController(const CenteralControllerParams &params);
+    virtual void startup() override;
+
+    void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; }
+    void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; }
+
+    void createPopCountDirectory(int atoms_per_block);
+
+    void createBFSWorkload(Addr init_addr, uint32_t init_value);
+    void createBFSVisitedWorkload(Addr init_addr, uint32_t init_value);
+    void createSSSPWorkload(Addr init_addr, uint32_t init_value);
+    void createCCWorkload();
+    void createAsyncPRWorkload(float alpha, float threshold);
+    void createPRWorkload(int num_nodes, float alpha);
+    void createBCWorkload(Addr init_addr, uint32_t init_value);
+
+    bool bufferRemoteUpdate(int slice_number, PacketPtr pkt);
+    int getnumGPTs() { return mpuVector.size(); }
+
+    void recvDoneSignal();
+
+    int workCount();
+    float getPRError();
+    void printAnswerToHostSimout();
+
+    // Per-MPU buffers of remote updates, bucketed by destination slice.
+    std::unordered_map<MPU*, std::unordered_map<int, std::deque<PacketPtr>>>
+        remoteUpdates;
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc
new file mode 100644
index 0000000000..8c38341f48
--- /dev/null
+++ b/src/accl/graph/sega/coalesce_engine.cc
@@ -0,0 +1,1275 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/coalesce_engine.hh"
+
+#include <tuple>
+
+#include "accl/graph/sega/mpu.hh"
+#include "base/intmath.hh"
+#include "debug/CacheBlockState.hh"
+#include "debug/CoalesceEngine.hh"
+#include "debug/SEGAStructureSize.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+namespace gem5
+{
+
+CoalesceEngine::CoalesceEngine(const Params &params):
+    BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0),
+    numLines((int) (params.cache_size / peerMemoryAtomSize)),
+    numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))),
+    onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle),
+    pullsReceived(0), pullsScheduled(0),
+    pendingPullLimit(params.pending_pull_limit),
+    pendingPullReads(0), activeBufferSize(params.active_buffer_size),
+    postPushWBQueueSize(params.post_push_wb_queue_size),
+    nextMemoryEvent([this] {
+        processNextMemoryEvent();
+    }, name() + ".nextMemoryEvent"),
+    nextResponseEvent([this] {
+        processNextResponseEvent();
+    }, name() + ".nextResponseEvent"),
+    nextApplyEvent([this] {
+        processNextApplyEvent();
+    }, name() + ".nextApplyEvent"),
+    nextDoneSignalEvent([this] {
+        processNextDoneSignalEvent();
+    }, name() + ".nextDoneSignalEvent"),
+    stats(*this)
+{
+    assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine));
+    cacheBlocks = new Block [numLines];
+    for (int i = 0; i < numLines; i++) {
+        cacheBlocks[i] = Block(numElementsPerLine);
+    }
+    currentActiveCacheBlocks = UniqueFIFO<int>(numLines);
+    futureActiveCacheBlocks = UniqueFIFO<int>(numLines);
+
+    activeBuffer.clear();
+    postPushWBQueue.clear();
+}
+
+void
+CoalesceEngine::registerMPU(MPU* mpu)
+{
+    owner = mpu;
+}
+
+// NOTE: Used for initializing memory and reading the final answer
+void
+CoalesceEngine::recvFunctional(PacketPtr pkt)
+{
+    if (pkt->isRead()) {
+        assert(pkt->getSize() == peerMemoryAtomSize);
+        Addr addr = pkt->getAddr();
+        int block_index = getBlockIndex(addr);
+
+        if ((cacheBlocks[block_index].addr == addr) &&
+            (cacheBlocks[block_index].valid)) {
+            assert(cacheBlocks[block_index].state == CacheState::IDLE);
+
+            pkt->makeResponse();
+            pkt->setDataFromBlock(
+                (uint8_t*) cacheBlocks[block_index].items,
+                peerMemoryAtomSize);
+        } else {
+            memPort.sendFunctional(pkt);
+        }
+    } else {
+        graphWorkload->init(pkt, currentDirectory);
+        if (pkt->getAddr() > lastAtomAddr) {
+            lastAtomAddr = pkt->getAddr();
+        }
+        memPort.sendFunctional(pkt);
+    }
+}
+
+void
+CoalesceEngine::postMemInitSetup()
+{
+    currentDirectory->setLastAtomAddr(lastAtomAddr);
+}
+
+void
+CoalesceEngine::postConsumeProcess()
+{
+    Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr);
+    for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) {
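+        // local_addr walks the de-interleaved (local) view of this
+        // engine's address range; addIntlvBits() converts it back to the
+        // global address of the vertex atom.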
+ Addr addr = peerMemoryRange.addIntlvBits(local_addr); + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureActiveCacheBlocks.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { + futureActiveCacheBlocks.erase(block_index); + } + } else { + WorkListItem items[numElementsPerLine]; + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete read_pkt; + delete write_pkt; + } + } +} + +void +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void +CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(currentActiveCacheBlocks.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + currentActiveCacheBlocks.clear(); + currentActiveCacheBlocks = futureActiveCacheBlocks; + futureActiveCacheBlocks.clear(); +} + +bool +CoalesceEngine::done() +{ + return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); +} + +bool +CoalesceEngine::enoughSpace() +{ + return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; +} + +bool +CoalesceEngine::pullCondition() +{ + bool enough_space = 
enoughSpace(); + bool schedule_limit = pullsScheduled < pendingPullLimit; + return enough_space && schedule_limit; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +ReadReturnStatus +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].valid)) { + // Hit + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(cacheBlocks[block_index].state != CacheState::INVALID); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].state = CacheState::BUSY; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. 
For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].dirty); + + assert(MSHR.find(block_index) != MSHR.end()); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else { + // miss + assert(cacheBlocks[block_index].addr != aligned_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); + stats.readMisses++; + if (cacheBlocks[block_index].state != CacheState::INVALID) { + // conflict miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " + "Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + cacheBlocks[block_index].hasConflict = true; + if (cacheBlocks[block_index].state == CacheState::IDLE) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + // NOTE: The cache block could still be active even though + // it is not dirty. If it is active, we only have to move + // the activity tracking but can throw the data away. + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + // NOTE: Bring the cache line to the invalid state. + // NOTE: The line above where we set hasConflict to true + // does not matter anymore, since we reset the cache line.
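+                    // In other words, a clean conflict eviction only + // relocates the activity bookkeeping: the cache-side FIFOs drop this + // block index and the matching directory re-activates the atom's + // address, so no data needs to be written back.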
+ cacheBlocks[block_index].reset(); + } + return ReadReturnStatus::REJECT_NO_ROLL; + } else { + stats.numConflicts++; + return ReadReturnStatus::REJECT_ROLL; + } + } else { + // cold miss + assert(MSHR.find(block_index) == MSHR.end()); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + return ReadReturnStatus::ACCEPT; + } + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + + onTheFlyReqs--; + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState<ReadPurpose>(); + + // NOTE: Regardless of where the pkt will go, we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. + // NOTE: Also, we have to stop tracking the address in pendingPullAddrs. + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); + } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, the line should be in PENDING_DATA state. + // Regardless of the purpose for which it was read, it should + // be placed in the cache array. + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR. + assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but space was allocated + // for it in the cache later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // We move that back in time because it would be fine if + // processNextRead happened before the pkt arrived. processNextRead + // actually will check if there is a pending read for push for + // the address it's trying to populate. + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } + + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore.
+ // Since it is going to the cache, cache will be responsible for + // tracking this. Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + currentActiveCacheBlocks.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + futureActiveCacheBlocks.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. 
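+            // Note that pendingPullReads was already decremented for this + // packet above, which is why the assertion below can use a strict + // inequality: the slot that was reserved for this packet is now the + // free entry it will occupy.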
+ assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + activeBuffer.emplace_back(pkt, curTick()); + } else { + stats.wastefulBytesRead += pkt->getSize(); + delete pkt; + } + + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; + } + } + delete purpose; + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle. + if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. 
This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + + // NOTE: Design does not allow for write misses. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].busyMask != 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); + + // respective bit in busyMask for wl is set. + assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].dirty |= true; + } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].items[wl_offset] = wl; + if (mode == ProcessingMode::ASYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeNow |= active; + if (active && (!currentActiveCacheBlocks.find(block_index))) { + currentActiveCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; + if (active && (!futureActiveCacheBlocks.find(block_index))) { + futureActiveCacheBlocks.push_back(block_index); + } + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (cacheBlocks[block_index].busyMask == 0) { + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; 
+ cacheBlocks[block_index].lastChangedTick = curTick(); + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + if (memPort.blocked()) { + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function<void(int, Tick)> next_memory_function; + int next_memory_function_input; + Tick next_memory_function_tick; + std::tie( + next_memory_function, + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); + memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) + * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // TODO: Figure out if this is still necessary. + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // then it is definitely currently not active.
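+            // (A post-push write back is only created after every activeNow + // bit in the atom has been applied and cleared, so only the + // activeFuture bits checked below can still be set.)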
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + } else { + wb++; + } + } + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + ab = activeBuffer.erase(ab); + delete ab_pkt; + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + pullsScheduled++; + } + } else { + ab++; + } + } + if (!need_send_pkt) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } + + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. 
+ bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + if (enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + cacheBlocks[block_index].reset(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu in " + "favor of the write back scheduled later.\n", + __func__, block_index, schedule_tick); + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + if (!postPushWBQueue.empty()) { + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } + } +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: processNextVertexPull called.\n", __func__); + pullsScheduled--; + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); + } + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); + } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + pendingPullAddrs.insert(addr); + } + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: No pending MemRetry.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +int +CoalesceEngine::workCount() +{ + return currentActiveCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); +} + +void +CoalesceEngine::recvVertexPull() +{ + pullsReceived++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. 
pullsReceived: %d.\n", __func__, pullsReceived); + + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; + if (!nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextApplyEvent() +{ + if ((!activeBuffer.empty()) && + (postPushWBQueue.size() < postPushWBQueueSize)) { + PacketPtr pkt; + Tick entrance_tick; + WorkListItem items[numElementsPerLine]; + + std::tie(pkt, entrance_tick) = activeBuffer.front(); + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (items[index].activeNow) { + Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(items[index]); + items[index].activeNow = false; + owner->recvVertexPush(addr, delta, items[index].edgeIndex, + items[index].degree); + pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + } + // NOTE: If the atom is not active anymore, retire it from the + // activeBuffer and queue a write back for it. + if (!atom_active_now) { + PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), + peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + activeBuffer.pop_front(); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + delete pkt; + } + } else if (!currentActiveCacheBlocks.empty()) { + int num_visited_indices = 0; + int initial_fifo_length = currentActiveCacheBlocks.size(); + while (true) { + int block_index = currentActiveCacheBlocks.front(); + if (cacheBlocks[block_index].state == CacheState::IDLE) { + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (cacheBlocks[block_index].items[index].activeNow) { + Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].items[index].activeNow = false; + cacheBlocks[block_index].dirty = true; + owner->recvVertexPush(addr, delta, + cacheBlocks[block_index].items[index].edgeIndex, + cacheBlocks[block_index].items[index].degree); + pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + } + // NOTE: If no item in this cache block is active anymore, + // stop tracking it in the active FIFO. + if (!atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + } + break; + } + // NOTE: If the block with the index at the front of + // currentActiveCacheBlocks is not in IDLE state, then roll + // that index to the back. + currentActiveCacheBlocks.pop_front(); + currentActiveCacheBlocks.push_back(block_index); + // NOTE: If we have visited all the indices initially in the FIFO
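+            // (i.e. the FIFO has been rotated through once without finding + // an IDLE block), give up for this cycle; the apply event is + // rescheduled below as long as pullsReceived is still positive.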
+ num_visited_indices++; + if (num_visited_indices == initial_fifo_length) { + break; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__); + stats.worklessCycles++; + } + + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; + } + + if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} + +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + lastResetTick(0), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertices read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertices written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), + ADD_STAT(numConflicts, statistics::units::Count::get(), + "Number of conflicts raised by reads in the cache."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage."), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by the coalesce engine."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by the PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine."), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "Cycles in which the coalesce engine could not find work to apply."), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory."), + ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + currentFrontierSize.init(64); + futureFrontierSize.init(64); + currentBlockActiveCount.init(64); + futureBlockActiveCount.init(64); + responseQueueLatency.init(64); + memoryFunctionLatency.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh new file mode 100644 index 0000000000..10a71a7ef1 --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include <deque> +#include <functional> +#include <string> +#include <tuple> +#include <unordered_map> +#include <unordered_set> +#include <vector> + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/work_directory.hh" +#include "base/cprintf.hh" +#include "base/statistics.hh" +#include "params/CoalesceEngine.hh" + +namespace gem5 +{ + +class MPU; + +class CoalesceEngine : public BaseMemoryEngine +{ + private: + struct Block + { + WorkListItem* items; + Addr addr; + uint64_t busyMask; + bool valid; + bool dirty; + bool hasConflict; + CacheState state; + Tick lastChangedTick; + Block() {} + Block(int num_elements): + addr(-1), + busyMask(0), + valid(false), + dirty(false), + hasConflict(false), + state(CacheState::INVALID), + lastChangedTick(0) + { + items = new WorkListItem [num_elements]; + } + + void reset() { + addr = -1; + busyMask = 0; + valid = false; + dirty = false; + hasConflict = false; + state = CacheState::INVALID; + lastChangedTick = 0; + } + + std::string to_string() { + return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "dirty: %s, hasConflict: %s, state: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + dirty ? "true" : "false", hasConflict ? "true" : "false", + cacheStateStrings[state], lastChangedTick); + } + }; + + struct ReadPurpose : public Packet::SenderState + { + ReadDestination _dest; + ReadPurpose(ReadDestination dest): _dest(dest) {} + ReadDestination dest() { return _dest; } + }; + + MPU* owner; + ProcessingMode mode; + WorkDirectory* currentDirectory; + WorkDirectory* futureDirectory; + GraphWorkload* graphWorkload; + + Addr lastAtomAddr; + + int numLines; + int numElementsPerLine; + Block* cacheBlocks; + + int onTheFlyReqs; + std::unordered_map<int, std::vector<Addr>> MSHR; + + // Response route to WLEngine + int maxRespPerCycle; + std::deque<std::tuple<Addr, WorkListItem, Tick>> responseQueue; + + // Tracking work in cache + int pullsReceived; + // NOTE: Remember to erase from these upon eviction from cache + UniqueFIFO<int> currentActiveCacheBlocks; + UniqueFIFO<int> futureActiveCacheBlocks; + + int pullsScheduled; + int pendingPullLimit; + int pendingPullReads; + // Addresses with an in-flight read for push; used to avoid + // issuing duplicate pull reads for the same atom.
+ std::unordered_set<Addr> pendingPullAddrs; + + int activeBufferSize; + int postPushWBQueueSize; + std::deque<std::tuple<PacketPtr, Tick>> activeBuffer; + std::deque<std::tuple<PacketPtr, Tick>> postPushWBQueue; + + bool enoughSpace(); + bool pullCondition(); + int getBlockIndex(Addr addr); + + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); + std::deque<std::tuple<std::function<void(int, Tick)>, int, Tick>> memoryFunctionQueue; + + EventFunctionWrapper nextResponseEvent; + void processNextResponseEvent(); + + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine &coalesce); + + virtual void regStats() override; + + virtual void resetStats() override; + + CoalesceEngine &coalesce; + + Tick lastResetTick; + + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar numConflicts; + statistics::Scalar responsePortShortage; + statistics::Scalar numMemoryBlocks; + statistics::Scalar wastefulBytesRead; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + statistics::Scalar worklessCycles; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram currentFrontierSize; + statistics::Histogram futureFrontierSize; + statistics::Histogram currentBlockActiveCount; + statistics::Histogram futureBlockActiveCount; + statistics::Histogram responseQueueLatency; + statistics::Histogram memoryFunctionLatency; + }; + + CoalesceStats stats; + + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; + + public: + PARAMS(CoalesceEngine); + CoalesceEngine(const Params &params); + void registerMPU(MPU* mpu); + + void setProcessingMode(ProcessingMode _mode) { mode = _mode; } + void createAsyncPopCountDirectory(int atoms_per_block); + void createBSPPopCountDirectory(int atoms_per_block); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + + virtual void recvFunctional(PacketPtr pkt); + void postMemInitSetup(); + void postConsumeProcess(); + void swapDirectories(); + + ReadReturnStatus recvWLRead(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + int getSliceSize() + { return (int) (params().cache_size); } + // /sizeof(WorkListItem)); } + + int workCount(); + int futureWorkCount(); + void recvVertexPull(); + + bool done(); +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/coalesce_engine_s.cc b/src/accl/graph/sega/coalesce_engine_s.cc new file mode 100644 index 0000000000..6a5261d38c --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_s.cc @@ -0,0 +1,1223 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/coalesce_engine.hh" + +#include + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), + onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), + pendingPullLimit(params.pending_pull_limit), + pendingPullReads(0), activeBufferSize(params.active_buffer_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), + nextDoneSignalEvent([this] { + processNextDoneSignalEvent(); + }, name() + ".nextDoneSignalEvent"), + stats(*this) +{ + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); + } + activeBuffer.clear(); + postPushWBQueue.clear(); +} + +void +CoalesceEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + + +// NOTE: Used for initializing memory and reading the final answer +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].state == CacheState::IDLE); + + pkt->makeResponse(); + pkt->setDataFromBlock( + 
(uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + graphWorkload->init(pkt, currentDirectory); + if (pkt->getAddr() > lastAtomAddr) { + lastAtomAddr = pkt->getAddr(); + } + memPort.sendFunctional(pkt); + } +} + +void +CoalesceEngine::postMemInitSetup() +{ + currentDirectory->setLastAtomAddr(lastAtomAddr); +} + +void +CoalesceEngine::postConsumeProcess() +{ + Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); + for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { + Addr addr = peerMemoryRange.addIntlvBits(local_addr); + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureActiveCacheBlocks.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { + futureActiveCacheBlocks.erase(block_index); + } + } else { + WorkListItem items[numElementsPerLine]; + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete read_pkt; + delete write_pkt; + } + } +} + +void +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void +CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(currentActiveCacheBlocks.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = 
currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + currentActiveCacheBlocks.clear(); + currentActiveCacheBlocks = futureActiveCacheBlocks; + futureActiveCacheBlocks.clear(); +} + +bool +CoalesceEngine::done() +{ + return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); +} + +bool +CoalesceEngine::enoughSpace() +{ + return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; +} + +bool +CoalesceEngine::pullCondition() +{ + bool enough_space = enoughSpace(); + bool schedule_limit = pullsScheduled < pendingPullLimit; + return enough_space && schedule_limit; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + + + +ReadReturnStatus +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); + //assert(addr in a right slice) + // assert((cacheBlocks[block_index].addr == aligned_addr)) + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) and + (cacheBlocks[block_index].valid)) { + // Hit + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(cacheBlocks[block_index].state != CacheState::INVALID); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].state = CacheState::BUSY; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. 
For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else if (cacheBlocks[block_index].state == CacheState::PENDING_DATA) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].dirty); + + assert(MSHR.find(block_index) != MSHR.end()); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else { + // cold miss + assert(cacheBlocks[block_index].addr != aligned_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a cold miss.\n", + __func__, addr); + stats.readMisses++; + assert(MSHR.find(block_index) == MSHR.end()); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + return ReadReturnStatus::ACCEPT; + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + + onTheFlyReqs--; + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState<ReadPurpose>(); + + // NOTE: Regardless of where the pkt will go we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. + // NOTE: Also we have to stop tracking the address for pullAddrs + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); + } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, line should be in PENDING_DATA state. + // Regardless of the purpose for which it was read, it should + // be placed in the cache array. + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR.
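+            // PENDING_DATA lines are only created by the miss path of + // recvWLRead, which always records the missing address in the MSHR + // before scheduling processNextRead; hence the assertion below.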
+ assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but a cache line was + // allocated for it later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // Moving it back in time is safe because it would be fine if + // processNextRead happened before the pkt arrived: processNextRead + // checks whether there is a pending read for push for + // the address it's trying to populate. + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } + + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore. + // Since it is going to the cache, the cache will be responsible for + // tracking this. Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + currentActiveCacheBlocks.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + futureActiveCacheBlocks.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n", __func__, miss_addr, block_index); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue.
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. + assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + activeBuffer.emplace_back(pkt, curTick()); + } else { + stats.wastefulBytesRead += pkt->getSize(); + delete pkt; + } + + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; + } + } + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while (true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle.
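+                // (Note that the loop header already breaks while the head + // entry is younger than one cycle, so responsePortShortage below + // may also count cycles where the head entry is simply too young + // rather than blocked by the port-width limit.)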
+ if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + + // NOTE: Design does not allow for write misses. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].busyMask != 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); + + // respective bit in busyMask for wl is set. + assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].dirty |= true; + } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].items[wl_offset] = wl; + if (mode == ProcessingMode::ASYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeNow |= active; + if (active && (!currentActiveCacheBlocks.find(block_index))) { + currentActiveCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; + if (active && (!futureActiveCacheBlocks.find(block_index))) { + futureActiveCacheBlocks.push_back(block_index); + } + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (cacheBlocks[block_index].busyMask == 0) { + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + 
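+                    // The block is dropped without a write back since it is + // clean; hand its current activity back from the cache-side + // FIFO to the in-memory directory so it is not lost.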
currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; + cacheBlocks[block_index].lastChangedTick = curTick(); + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + if (memPort.blocked()) { + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function<void(int, Tick)> next_memory_function; + int next_memory_function_input; + Tick next_memory_function_tick; + std::tie( + next_memory_function, + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); + memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) + * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if (!memoryFunctionQueue.empty()) { + schedule(nextMemoryEvent, nextCycle()); + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // TODO: Figure out if this is still necessary. + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // then it is definitely not currently active.
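+            // (Atoms only enter the postPushWBQueue from + // processNextApplyEvent after all of their activeNow bits have been + // consumed, which is why only future activity is checked below.)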
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + } else { + wb++; + } + } + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + ab = activeBuffer.erase(ab); + delete ab_pkt; + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + pullsScheduled++; + } + } else { + ab++; + } + } + if (!need_send_pkt) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } + + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. 
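+            // (When the atom is currently active and the activeBuffer has + // room, the memory write is skipped entirely: the data is parked in + // the activeBuffer so it can be pushed from on chip and written back + // afterwards.)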
+ bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + if (enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + cacheBlocks[block_index].reset(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + if (!postPushWBQueue.empty()) { + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } + } +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + pullsScheduled--; + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); + } + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); + } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + pendingPullAddrs.insert(addr); + } + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +int +CoalesceEngine::workCount() +{ + return currentActiveCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); +} + +void +CoalesceEngine::recvVertexPull() +{ + pullsReceived++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. 
pullsReceived: %d.\n", __func__, pullsReceived); + + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; + if (!nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextApplyEvent() +{ + if ((!activeBuffer.empty()) && + (postPushWBQueue.size() < postPushWBQueueSize)) { + PacketPtr pkt; + Tick entrance_tick; + WorkListItem items[numElementsPerLine]; + + std::tie(pkt, entrance_tick) = activeBuffer.front(); + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (items[index].activeNow) { + Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(items[index]); + items[index].activeNow = false; + owner->recvVertexPush(addr, delta, items[index].edgeIndex, + items[index].degree); + pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + } + // NOTE: If the atom is not active anymore, write it back. + if (!atom_active_now) { + PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), + peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + activeBuffer.pop_front(); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + delete pkt; + } + } else if (!currentActiveCacheBlocks.empty()) { + int num_visited_indices = 0; + int initial_fifo_length = currentActiveCacheBlocks.size(); + while (true) { + int block_index = currentActiveCacheBlocks.front(); + if (cacheBlocks[block_index].state == CacheState::IDLE) { + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (cacheBlocks[block_index].items[index].activeNow) { + Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].items[index].activeNow = false; + cacheBlocks[block_index].dirty = true; + owner->recvVertexPush(addr, delta, + cacheBlocks[block_index].items[index].edgeIndex, + cacheBlocks[block_index].items[index].degree); + pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + } + // NOTE: If no item in this block is active anymore, + // stop tracking it. + if (!atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + } + break; + } + // NOTE: If the block at the front of activeCacheBlocks is not + // in IDLE state, roll that index to the back. + currentActiveCacheBlocks.pop_front(); + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Stop once we have visited all the items initially + // in the FIFO.
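+                // (Bounding the scan by the FIFO's initial length guarantees + // this loop terminates even when every resident active block is + // currently BUSY.)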
+ num_visited_indices++; + if (num_visited_indices == initial_fifo_length) { + break; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__); + stats.worklessCycles++; + } + + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; + } + + if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} + +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + lastResetTick(0), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertices read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertices written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), + ADD_STAT(numConflicts, statistics::units::Count::get(), + "Number of conflicts raised by reads in the cache."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage."), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by the coalesce engine."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine."), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "Cycles the coalesce engine could not find work to apply."), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory."), + ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine.
(ns)"), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + currentFrontierSize.init(64); + futureFrontierSize.init(64); + currentBlockActiveCount.init(64); + futureBlockActiveCount.init(64); + responseQueueLatency.init(64); + memoryFunctionLatency.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc new file mode 100644 index 0000000000..f7ef96197f --- /dev/null +++ b/src/accl/graph/sega/enums.cc @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/enums.hh" + +namespace gem5 +{ + +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_WB" +}; + +const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = +{ + "ACCEPT", + "REJECT_ROLL", + "REJECT_NO_ROLL" +}; + +const char* readDestinationStrings[NUM_READ_DESTINATION] = +{ + "READ_FOR_CACHE", + "READ_FOR_PUSH" +}; + +const char* processingModeStrings[NUM_PROCESSING_MODE] = +{ + "NOT_SET", + "ASYNCHRONOUS", + "BULK_SYNCHRONOUS" +}; + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh new file mode 100644 index 0000000000..f97c33a0e0 --- /dev/null +++ b/src/accl/graph/sega/enums.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_ENUMS_HH__ +#define __ACCL_GRAPH_SEGA_ENUMS_HH__ + +namespace gem5 +{ + +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_WB, + NUM_CACHE_STATE +}; +extern const char* cacheStateStrings[NUM_CACHE_STATE]; + +enum ReadReturnStatus +{ + ACCEPT, + REJECT_ROLL, + REJECT_NO_ROLL, + NUM_READ_RETURN_STATUS +}; +extern const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS]; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH, + NUM_READ_DESTINATION +}; +extern const char* readDestinationStrings[NUM_READ_DESTINATION]; + +enum ProcessingMode +{ + NOT_SET, + ASYNCHRONOUS, + BULK_SYNCHRONOUS, + NUM_PROCESSING_MODE +}; +extern const char* processingModeStrings[NUM_PROCESSING_MODE]; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..318ea0798b --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/mpu.hh" + +#include + +#include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +MPU::MPU(const Params& params): + ClockedObject(params), + system(params.system), + wlEngine(params.wl_engine), + coalesceEngine(params.coalesce_engine), + pushEngine(params.push_engine), + sliceCounter(0), + nextSliceEvent([this] { processNextSliceEvent(); }, name()) +{ + wlEngine->registerMPU(this); + coalesceEngine->registerMPU(this); + pushEngine->registerMPU(this); +} + +void +MPU::registerCenteralController(CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + +int +MPU::getSliceSize() +{ + int slice_number = + (coalesceEngine->getSliceSize() * centeralController->getnumGPTs()); + + return slice_number; +} + +bool +MPU::bufferRemoteUpdate(int slice_number, PacketPtr pkt) +{ + return centeralController->bufferRemoteUpdate(slice_number, pkt); +} + +bool +MPU::handleIncomingUpdate(PacketPtr pkt) +{ + return wlEngine->handleIncomingUpdate(pkt); +} + +void +MPU::scheduleNewSlice() +{ + if (!nextSliceEvent.scheduled()) { + schedule(nextSliceEvent, nextCycle()); + } + return; +} + +void +MPU::processNextSliceEvent() +{ + auto new_update = + centeralController->remoteUpdates[this][this->getSliceCounter()].front(); + bool sent = wlEngine->handleIncomingUpdate(new_update); + + centeralController->remoteUpdates[this] + [this->getSliceCounter()].pop_front(); + if (!sent) { + centeralController->remoteUpdates[this] + [this->getSliceCounter()].push_back(new_update); + } + + if (!centeralController->remoteUpdates[this][this->getSliceCounter()].empty() && !nextSliceEvent.scheduled()) { + schedule(nextSliceEvent, nextCycle()); + } + +} + +void +MPU::handleIncomingWL(Addr addr, WorkListItem wl) +{ + wlEngine->handleIncomingWL(addr, wl); +} + +void +MPU::recvWLWrite(Addr addr, WorkListItem wl) +{ + coalesceEngine->recvWLWrite(addr, wl); +} + +void +MPU::recvWorkload(GraphWorkload* workload) +{ + coalesceEngine->recvWorkload(workload); + pushEngine->recvWorkload(workload); + wlEngine->recvWorkload(workload); +} + +void +MPU::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + pushEngine->recvVertexPush(addr, delta, edge_index, degree); +} + +void +MPU::recvDoneSignal() +{ + if (done()) { + centeralController->recvDoneSignal(); + } +} + +bool +MPU::done() +{ + return wlEngine->done() && coalesceEngine->done() && pushEngine->done(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..2008a7dc4f --- /dev/null +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 
+1,107 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" +#include "params/MPU.hh" + +namespace gem5 +{ + +class CenteralController; + +class MPU : public ClockedObject +{ + private: + System* system; + CenteralController* centeralController; + + WLEngine* wlEngine; + CoalesceEngine* coalesceEngine; + PushEngine* pushEngine; + int sliceCounter; + + EventFunctionWrapper nextSliceEvent; + void processNextSliceEvent(); + public: + PARAMS(MPU); + MPU(const Params& params); + void registerCenteralController(CenteralController* centeral_controller); + + void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } + + unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } + void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } + void swapDirectories() { coalesceEngine->swapDirectories(); } + + int getSliceSize(); + int getSliceCounter() { return sliceCounter; } + int increaseSliceCounter() { return sliceCounter++; } + void updateSliceCounter(int value) { sliceCounter = value;} + void resetSliceCounter() { 
sliceCounter = 0; } + bool bufferRemoteUpdate(int slice_number, PacketPtr pkt); + void scheduleNewSlice(); + + bool handleIncomingUpdate(PacketPtr pkt); + void handleIncomingWL(Addr addr, WorkListItem wl); + ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + void recvWLWrite(Addr addr, WorkListItem wl); + void recvWorkload(GraphWorkload* Workload); + + int workCount() { return coalesceEngine->workCount(); } + void recvVertexPull() { return coalesceEngine->recvVertexPull(); } + bool running() { return pushEngine->running(); } + void start() { return pushEngine->start(); } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + + void recvDoneSignal(); + bool done(); +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc new file mode 100644 index 0000000000..981b581b7c --- /dev/null +++ b/src/accl/graph/sega/push_engine.cc @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +#include "accl/graph/sega/mpu.hh" +#include "debug/PushEngine.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const Params& params): + BaseMemoryEngine(params), + _running(false), + lastIdleEntranceTick(0), + numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), + onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + maxPropagatesPerCycle(params.max_propagates_per_cycle), + updateQueueSize(params.update_queue_size), + nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), + nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), + nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), + stats(*this) +{ + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + outPorts.emplace_back( + name() + ".out_ports" + std::to_string(i), this, i); + } +} + +Port& +PushEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "out_ports") { + return outPorts[idx]; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort(if_name, idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +PushEngine::init() +{ + localAddrRange = owner->getAddrRanges(); + for (int i = 0; i < outPorts.size(); i++){ + portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); + } +} + +void +PushEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + DPRINTF(PushEngine, "%s: Packet is blocked.\n", __func__); + blockedPacket = pkt; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + DPRINTF(PushEngine, "%s: blockedPacket sent successfully.\n", __func__); + owner->recvReqRetry(); + } +} + +void +PushEngine::recvReqRetry() +{ + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + +bool +PushEngine::vertexSpace() +{ + return (edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) < edgePointerQueueSize); +} + +bool +PushEngine::workLeft() +{ + return ((owner->workCount() - numPendingPulls) > 0); +} + +bool +PushEngine::done() +{ + bool empty_update_queues = true; + for (int i = 0; i < outPorts.size(); i++) { + empty_update_queues &= updateQueues[outPorts[i].id()].empty(); + } + return empty_update_queues && metaEdgeQueue.empty() && + (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); +} + +void +PushEngine::start() +{ + assert(!_running); + // assert(!nextVertexPullEvent.scheduled()); + + _running = true; + // stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); + // NOTE: We might have to check for size availability here. 
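+    // (start() is expected to be called only after the caller has produced + // work, e.g. when CoalesceEngine::recvWLWrite activates a block, hence + // the assert below instead of a graceful return.)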
+ assert(workLeft()); + if (vertexSpace() && !nextVertexPullEvent.scheduled()) { + schedule(nextVertexPullEvent, nextCycle()); + } +} + +void +PushEngine::processNextVertexPullEvent() +{ + if (workLeft()) { + numPendingPulls++; + owner->recvVertexPull(); + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + } else { + _running = false; + lastIdleEntranceTick = curTick(); + DPRINTF(PushEngine, "%s: In idle state now.\n", __func__); + } +} + +void +PushEngine::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + assert(degree > 0); + assert((edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); + + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); + + edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); + numPendingPulls--; + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +void +PushEngine::processNextMemoryReadEvent() +{ + if (memPort.blocked()) { + nextMemoryReadEvent.sleep(); + return; + } + Addr aligned_addr, offset; + int num_edges; + + EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front()); + Tick entrance_tick = std::get<1>(edgePointerQueue.front()); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + { + DPRINTF(PushEngine, "%s: Current packet information generated by " + "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); + PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges}; + reqInfoMap[pkt->req] = push_info; + memPort.sendPacket(pkt); + onTheFlyMemReqs += num_edges; + + curr_info.iterate(); + if (curr_info.done()) { + DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); + stats.edgePointerQueueLatency.sample( + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + edgePointerQueue.pop_front(); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); + DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. " + "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); + } + } + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if (!edgePointerQueue.empty()) { + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +void +PushEngine::recvMemRetry() +{ + if (nextMemoryReadEvent.pending()) { + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); + nextMemoryReadEvent.wake(); + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +bool +PushEngine::handleMemResp(PacketPtr pkt) +{ + // TODO: In case we ever need to edit edges, drop the isWrite condition. + assert(pkt->isResponse() && (!pkt->isWrite()));
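+    // NOTE: peerMemoryAtomSize is fixed at construction time; the stack + // buffer below relies on the variable-length-array extension provided + // by the GCC/Clang toolchains used to build gem5.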
+ uint8_t pkt_data[peerMemoryAtomSize]; + PushInfo push_info = reqInfoMap[pkt->req]; + pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); + + for (int i = 0; i < push_info.numElements; i++) { + Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); + Addr edge_dst = edge->neighbor; + uint32_t edge_weight = edge->weight; + MetaEdge meta_edge( + push_info.src, edge_dst, edge_weight, push_info.value); + metaEdgeQueue.emplace_back(meta_edge, curTick()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); + } + stats.numWastefulEdgesRead += + (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; + + onTheFlyMemReqs -= push_info.numElements; + reqInfoMap.erase(pkt->req); + delete pkt; + + if (!nextPropagateEvent.scheduled()) { + schedule(nextPropagateEvent, nextCycle()); + } + return true; +} + +void +PushEngine::processNextPropagateEvent() +{ + int num_propagates = 0; + while (true) { + MetaEdge meta_edge; + Tick entrance_tick; + std::tie(meta_edge, entrance_tick) = metaEdgeQueue.front(); + + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, meta_edge.to_string()); + + uint32_t update_value = + graphWorkload->propagate(meta_edge.value, meta_edge.weight); + Update update(meta_edge.src, meta_edge.dst, update_value); + metaEdgeQueue.pop_front(); + + if (enqueueUpdate(update)) { + DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", + __func__, meta_edge.to_string()); + stats.numPropagates++; + stats.edgeQueueLatency.sample( + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); + } else { + metaEdgeQueue.emplace_back(meta_edge, entrance_tick); + } + num_propagates++; + + if (metaEdgeQueue.empty()) { + break; + } + if (num_propagates >= maxPropagatesPerCycle) { + break; + } + } + + stats.numPropagatesHist.sample(num_propagates); + + assert(!nextPropagateEvent.scheduled()); + if (!metaEdgeQueue.empty()) { + schedule(nextPropagateEvent, nextCycle()); + } +} + +bool +PushEngine::enqueueUpdate(Update update) +{ + Addr dst_addr = update.dst; + bool found_coalescing = false; + bool found_locally = false; + bool accepted = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(dst_addr); + } + DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); + for (int i = 0; i < outPorts.size(); i++) { + AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; + if (contains(addr_range_list, dst_addr)) { + DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", + __func__, update.to_string(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: There are %d updates already " + "in queue for port %d.\n", __func__, + updateQueues[outPorts[i].id()].size(), + outPorts[i].id()); + for (auto& entry: updateQueues[outPorts[i].id()]) { + Update& curr_update = std::get<0>(entry); + if (curr_update.dst == update.dst) { + uint32_t old_value = curr_update.value; + curr_update.value = graphWorkload->reduce(old_value, update.value); + DPRINTF(PushEngine, "%s: Found a coalescing opportunity " + "for destination %d with new value: %d by " + "coalescing %d and %d.
\n", __func__, update.dst, + curr_update.value, old_value, update.value); + found_coalescing = true; + accepted = true; + stats.updateQueueCoalescions++; + } + } + if ((found_coalescing == false) && + (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { + DPRINTF(PushEngine, "%s: There is a free entry available " + "in queue %d.\n", __func__, outPorts[i].id()); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + DPRINTF(PushEngine, "%s: Emplaced the update at the back " + "of queue for port %d is. Size of queue " + "for port %d is %d.\n", __func__, + outPorts[i].id(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); + accepted = true; + stats.updateQueueLength.sample( + updateQueues[outPorts[i].id()].size()); + } + } + } + + if (accepted && (!nextUpdatePushEvent.scheduled())) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + return accepted; +} + +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +PushEngine::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < outPorts.size(); i++) { + if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d blocked.\n", + __func__, outPorts[i].id()); + continue; + } + DPRINTF(PushEngine, "%s: Port %d available.\n", + __func__, outPorts[i].id()); + if (updateQueues[outPorts[i].id()].empty()) { + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d is empty.\n", __func__, outPorts[i].id()); + continue; + } + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d not empty.\n", __func__, outPorts[i].id()); + Update update; + Tick entrance_tick; + std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); + PacketPtr pkt = createUpdatePacket(update.dst, update.value); + outPorts[i].sendPacket(pkt); + DPRINTF(PushEngine, "%s: Sent update: %s to port %d. 
" + "Respective queue size is %d.\n", __func__, + update.to_string(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); + updateQueues[outPorts[i].id()].pop_front(); + if (updateQueues[outPorts[i].id()].size() > 0) { + next_time_send += 1; + } + stats.numUpdates++; + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + +PushEngine::PushStats::PushStats(PushEngine &_push) + : statistics::Group(&_push), + push(_push), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Number of propagate operations done."), + ADD_STAT(numNetBlocks, statistics::units::Count::get(), + "Number of updates blocked by network."), + // ADD_STAT(numIdleCycles, statistics::units::Count::get(), + // "Number of cycles PushEngine has been idle."), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of updates sent to the network."), + ADD_STAT(numWastefulEdgesRead, statistics::units::Count::get(), + "Number of wasteful edges read from edge memory."), + ADD_STAT(TEPS, statistics::units::Rate::get(), + "Traversed Edges Per Second."), + ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgePointerQueueLength, statistics::units::Count::get(), + "Histogram of the size of the edgePointerQueue."), + ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the metaEdgeQueue."), + ADD_STAT(edgeQueueLength, statistics::units::Count::get(), + "Histogram of the size of the metaEdgeQueue."), + ADD_STAT(updateQueueLength, statistics::units::Count::get(), + "Histogram of the length of updateQueues."), + ADD_STAT(numPropagatesHist, statistics::units::Count::get(), + "Histogram of number of propagates sent.") +{ +} + +void +PushEngine::PushStats::regStats() +{ + using namespace statistics; + + TEPS = numPropagates / simSeconds; + + edgePointerQueueLatency.init(64); + edgePointerQueueLength.init(64); + edgeQueueLatency.init(64); + edgeQueueLength.init(64); + updateQueueLength.init(64); + numPropagatesHist.init(push.params().max_propagates_per_cycle); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh new file mode 100644 index 0000000000..f51865acb3 --- /dev/null +++ b/src/accl/graph/sega/push_engine.hh @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/base_memory_engine.hh"
+#include "accl/graph/sega/enums.hh"
+#include "base/intmath.hh"
+#include "params/PushEngine.hh"
+
+namespace gem5
+{
+
+class MPU;
+
+class PushEngine : public BaseMemoryEngine
+{
+  private:
+    class ReqPort : public RequestPort
+    {
+      private:
+        PushEngine* owner;
+        PacketPtr blockedPacket;
+        PortID _id;
+
+      public:
+        ReqPort(const std::string& name, PushEngine* owner, PortID id) :
+            RequestPort(name, owner),
+            owner(owner), blockedPacket(nullptr), _id(id)
+        {}
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return (blockedPacket != nullptr); }
+        PortID id() { return _id; }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    class EdgeReadInfoGen {
+      private:
+        Addr _src;
+        uint32_t _delta;
+
+        Addr _start;
+        Addr _end;
+        size_t _step;
+        size_t _atom;
+
+      public:
+        EdgeReadInfoGen(Addr src, uint32_t delta, Addr start,
+                        Addr end, size_t step, size_t atom):
+            _src(src), _delta(delta), _start(start),
+            _end(end), _step(step), _atom(atom)
+        {}
+
+        Addr src() { return _src; }
+        uint32_t delta() { return _delta; }
+
+        std::tuple<Addr, Addr, int> nextReadPacketInfo()
+        {
+            panic_if(done(),
+                     "Should not call nextReadPacketInfo when done.\n");
+            Addr aligned_addr = roundDown(_start, _atom);
+            Addr offset = _start - aligned_addr;
+            int num_items = 0;
+
+            if (_end > (aligned_addr + _atom)) {
+                num_items = (_atom - offset) / _step;
+            } else {
+                num_items = (_end - _start) / _step;
+            }
+
+            return std::make_tuple(aligned_addr, offset, num_items);
+        }
+
+        void iterate()
+        {
+            panic_if(done(), "Should not call iterate when done.\n");
+            Addr aligned_addr = roundDown(_start, _atom);
+            _start = aligned_addr + _atom;
+        }
+
+        bool done() { return (_start >= _end); }
+    };
+    struct PushInfo {
+        Addr src;
+        uint32_t value;
+        Addr offset;
+        int numElements;
+    };
+    MPU* owner;
+    GraphWorkload* graphWorkload;
+
+    bool _running;
+    Tick lastIdleEntranceTick;
+
+    AddrRangeList localAddrRange;
+
+    int numPendingPulls;
+    int edgePointerQueueSize;
+    std::deque<std::tuple<EdgeReadInfoGen, Tick>> edgePointerQueue;
+    std::unordered_map<RequestPtr, PushInfo> reqInfoMap;
+
+    int onTheFlyMemReqs;
+    int edgeQueueSize;
+    int maxPropagatesPerCycle;
+    std::deque<std::tuple<MetaEdge, Tick>> metaEdgeQueue;
+
+    int updateQueueSize;
+    template<typename T> PacketPtr createUpdatePacket(Addr addr, T value);
+    bool enqueueUpdate(Update update);
+    std::unordered_map<PortID, AddrRangeList> portAddrMap;
+    std::unordered_map<PortID, std::deque<std::tuple<Update, Tick>>>
+        updateQueues;
+    std::vector<ReqPort> outPorts;
+
+    bool vertexSpace();
+    bool workLeft();
+
+    EventFunctionWrapper nextVertexPullEvent;
+    void processNextVertexPullEvent();
+
+    MemoryEvent
nextMemoryReadEvent; + void processNextMemoryReadEvent(); + + EventFunctionWrapper nextPropagateEvent; + void processNextPropagateEvent(); + + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + + struct PushStats : public statistics::Group + { + PushStats(PushEngine &push); + + void regStats() override; + + PushEngine &push; + + statistics::Scalar numMemoryBlocks; + statistics::Scalar numPropagates; + statistics::Scalar numNetBlocks; + // statistics::Scalar numIdleCycles; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar numUpdates; + statistics::Scalar numWastefulEdgesRead; + + statistics::Formula TEPS; + + statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgePointerQueueLength; + statistics::Histogram edgeQueueLatency; + statistics::Histogram edgeQueueLength; + statistics::Histogram updateQueueLength; + statistics::Histogram numPropagatesHist; + }; + + PushStats stats; + + protected: + virtual void recvMemRetry(); + virtual bool handleMemResp(PacketPtr pkt); + + public: + PARAMS(PushEngine); + PushEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerMPU(MPU* mpu); + + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + + void start(); + bool running() { return _running; } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + + void recvReqRetry(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/state_machine.md b/src/accl/graph/sega/state_machine.md new file mode 100644 index 0000000000..203c47cf02 --- /dev/null +++ b/src/accl/graph/sega/state_machine.md @@ -0,0 +1 @@ +# CoalesceEngine Block state machine \ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc new file mode 100644 index 0000000000..b4649b6a9d --- /dev/null +++ b/src/accl/graph/sega/wl_engine.cc @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/wl_engine.hh" + +#include "accl/graph/sega/mpu.hh" +#include "debug/SEGAStructureSize.hh" +#include "debug/WLEngine.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams& params): + BaseReduceEngine(params), + updateQueueSize(params.update_queue_size), + registerFileSize(params.register_file_size), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()), + stats(*this) +{ + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } +} + +Port& +WLEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_ports") { + return inPorts[idx]; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +WLEngine::init() +{ + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } +} + +void +WLEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +AddrRangeList +WLEngine::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void +WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + needSendRetryReq = false; + sendRetryReq(); + } +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::checkRetryReq() +{ + for (int i = 0; i < inPorts.size(); i++) { + inPorts[i].checkRetryReq(); + } +} + +bool +WLEngine::done() +{ + return registerFile.empty() && updateQueue.empty(); +} + +bool +WLEngine::handleIncomingUpdate(PacketPtr pkt) +{ + int slice_number = (int)(pkt->getAddr()/(owner->getSliceSize())); + if (slice_number != owner->getSliceCounter()) { + DPRINTF(WLEngine, "%s: Packet %lu slice number is: %d. 
The current " + "slice number is: %d, The total number of vertices/slice: %d \n", + __func__, pkt->getAddr(), slice_number, + owner->getSliceCounter(), + owner->getSliceSize()/sizeof(WorkListItem)); + bool ret = owner->bufferRemoteUpdate(slice_number, pkt); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return ret; + } + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } + + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE(), curTick()); + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + + // delete the packet since it's not needed anymore. + delete pkt; + + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } + return true; +} + + +// TODO: Parameterize the number of pops WLEngine can do at a time. +// TODO: Add a histogram stats of the size of the updateQueue. Sample here. +void +WLEngine::processNextReadEvent() +{ + Addr update_addr; + uint32_t update_value; + Tick enter_tick; + std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); + + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " + "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); + + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = update_value; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + updateQueue.pop_front(); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. 
" + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + checkRetryReq(); + vertexReadTime[update_addr] = curTick(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + updateQueue.pop_front(); + updateQueue.emplace_back( + update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + stats.numUpdateRolls++; + } else { + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Not rolling the update.\n", __func__); + } + } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + stats.registerShortage++; + } + } else { + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", + __func__, update_addr, update_addr, registerFile[update_addr]); + registerFile[update_addr] = + graphWorkload->reduce(update_value, registerFile[update_addr]); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, registerFile[update_addr]); + stats.registerFileCoalesce++; + updateQueue.pop_front(); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + checkRetryReq(); + } + + if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { + schedule(nextReadEvent, nextCycle()); + } +} + +void +WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) +{ + assert(workListFile.size() <= registerFileSize); + + workListFile[addr] = wl; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + graphWorkload->printWorkListItem(wl), workListFile.size()); + DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + graphWorkload->printWorkListItem(wl), workListFile.size()); + + stats.vertexReadLatency.sample( + ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); + vertexReadTime.erase(addr); + + assert(!workListFile.empty()); + if (!nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } +} + +void +WLEngine::processNextReduceEvent() +{ + for (auto &it : workListFile) { + Addr addr = it.first; + assert(registerFile.find(addr) != registerFile.end()); + uint32_t update_value = registerFile[addr]; + DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" + ". registerFile[%lu] = %u, workListFile[%lu] = %s.\n", + __func__, addr, registerFile[addr], addr, + graphWorkload->printWorkListItem(workListFile[addr])); + // TODO: Generalize this to reduce function rather than just min + workListFile[addr].tempProp = + graphWorkload->reduce(update_value, workListFile[addr].tempProp); + DPRINTF(WLEngine, "%s: Reduction done. 
workListFile[%lu] = %s.\n",
+                __func__, addr,
+                graphWorkload->printWorkListItem(workListFile[addr]));
+        stats.numReduce++;
+
+        owner->recvWLWrite(addr, workListFile[addr]);
+        registerFile.erase(addr);
+        DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. "
+                "registerFile.size = %d, registerFileSize = %d.\n",
+                __func__, addr, registerFile.size(), registerFileSize);
+        DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. "
+                "registerFile.size = %d, registerFileSize = %d.\n",
+                __func__, addr, registerFile.size(), registerFileSize);
+    }
+    workListFile.clear();
+
+    if (done() && !nextDoneSignalEvent.scheduled()) {
+        schedule(nextDoneSignalEvent, nextCycle());
+    }
+}
+
+void
+WLEngine::processNextDoneSignalEvent()
+{
+    if (done()) {
+        owner->recvDoneSignal();
+    }
+}
+
+WLEngine::WorkListStats::WorkListStats(WLEngine &_wl)
+    : statistics::Group(&_wl),
+    wl(_wl),
+    ADD_STAT(numReduce, statistics::units::Count::get(),
+             "Number of reduce operations performed."),
+    ADD_STAT(registerFileCoalesce, statistics::units::Count::get(),
+             "Number of updates coalesced in the register file."),
+    ADD_STAT(registerShortage, statistics::units::Count::get(),
+             "Number of times updates were "
+             "stalled because of register shortage."),
+    ADD_STAT(numUpdateRolls, statistics::units::Count::get(),
+             "Number of times an update has been rolled back "
+             "to the back of the update queue due to cache reject."),
+    ADD_STAT(vertexReadLatency, statistics::units::Second::get(),
+             "Histogram of the latency of reading a vertex (ns)."),
+    ADD_STAT(updateQueueLatency, statistics::units::Second::get(),
+             "Histogram of the latency of dequeuing an update (ns).")
+{
+}
+
+void
+WLEngine::WorkListStats::regStats()
+{
+    using namespace statistics;
+
+    vertexReadLatency.init(64);
+    updateQueueLatency.init(64);
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh
new file mode 100644
index 0000000000..fb147e692a
--- /dev/null
+++ b/src/accl/graph/sega/wl_engine.hh
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
+
+#include <deque>
+#include <unordered_map>
+
+#include "accl/graph/base/base_reduce_engine.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/sega/enums.hh"
+#include "base/statistics.hh"
+#include "params/WLEngine.hh"
+
+namespace gem5
+{
+
+class MPU;
+
+class WLEngine : public BaseReduceEngine
+{
+  private:
+    class RespPort : public ResponsePort
+    {
+      private:
+        WLEngine* owner;
+        bool needSendRetryReq;
+        PortID _id;
+
+      public:
+        RespPort(const std::string& name, WLEngine* owner, PortID id):
+            ResponsePort(name, owner),
+            owner(owner), needSendRetryReq(false), _id(id)
+        {}
+        virtual AddrRangeList getAddrRanges() const;
+
+        PortID id() { return _id; }
+        void checkRetryReq();
+
+      protected:
+        virtual bool recvTimingReq(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt);
+        virtual void recvFunctional(PacketPtr pkt);
+        virtual void recvRespRetry();
+    };
+
+    MPU* owner;
+    GraphWorkload* graphWorkload;
+
+    std::vector<RespPort> inPorts;
+
+    int updateQueueSize;
+    std::deque<std::tuple<Addr, uint32_t, Tick>> updateQueue;
+
+    int registerFileSize;
+    std::unordered_map<Addr, uint32_t> registerFile;
+    std::unordered_map<Addr, Tick> vertexReadTime;
+    std::unordered_map<Addr, WorkListItem> workListFile;
+
+    EventFunctionWrapper nextReadEvent;
+    void processNextReadEvent();
+
+    EventFunctionWrapper nextReduceEvent;
+    void processNextReduceEvent();
+
+    EventFunctionWrapper nextDoneSignalEvent;
+    void processNextDoneSignalEvent();
+
+    struct WorkListStats : public statistics::Group
+    {
+        WorkListStats(WLEngine &worklist);
+
+        void regStats() override;
+
+        WLEngine &wl;
+
+        statistics::Scalar numReduce;
+        statistics::Scalar registerFileCoalesce;
+        statistics::Scalar registerShortage;
+        statistics::Scalar numUpdateRolls;
+
+        statistics::Histogram vertexReadLatency;
+        statistics::Histogram updateQueueLatency;
+    };
+
+    WorkListStats stats;
+
+  public:
+    PARAMS(WLEngine);
+    WLEngine(const Params& params);
+    Port& getPort(const std::string& if_name,
+                  PortID idx = InvalidPortID) override;
+    virtual void init() override;
+    void registerMPU(MPU* mpu);
+
+    AddrRangeList getAddrRanges();
+    void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; }
+    void recvFunctional(PacketPtr pkt);
+
+    bool handleIncomingUpdate(PacketPtr pkt);
+    void handleIncomingWL(Addr addr, WorkListItem wl);
+
+    void checkRetryReq();
+
+    bool done();
+};
+
+} // namespace gem5
+#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh
new file mode 100644
index 0000000000..620e97f654
--- /dev/null
+++ b/src/accl/graph/sega/work_directory.hh
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
+#define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
+
+#include <cassert>
+
+#include "accl/graph/base/data_structs.hh"
+#include "base/addr_range.hh"
+#include "base/types.hh"
+
+namespace gem5
+{
+
+class WorkDirectory
+{
+  public:
+    virtual int activate(Addr atom_addr) = 0;
+    virtual int deactivate(Addr atom_addr) = 0;
+    virtual Addr getNextWork() = 0;
+
+    virtual int workCount() = 0;
+    bool empty() { return workCount() == 0; }
+
+    virtual void setLastAtomAddr(Addr atom_addr) = 0;
+};
+
+class PopCountDirectory: public WorkDirectory
+{
+  private:
+    AddrRange memoryRange;
+
+    int numAtomsPerBlock;
+    int memoryAtomSize;
+    int blockSize;
+
+    uint32_t _workCount;
+
+    int numCounters;
+    int lastCounterIndex;
+    uint32_t* popCount;
+
+    int prevIndex;
+    uint32_t currentCounter;
+
+    UniqueFIFO<int> activeBlockIndices;
+
+    int getIndexFromAtomAddr(Addr atom_addr)
+    {
+        assert((atom_addr % memoryAtomSize) == 0);
+        Addr trimmed_addr = memoryRange.removeIntlvBits(atom_addr);
+        int index = (int) (trimmed_addr / blockSize);
+        return index;
+    }
+
+    Addr getAtomAddrFromIndex(int block_index, int atom_index)
+    {
+        Addr block_addr = block_index * blockSize;
+        Addr trimmed_addr = block_addr + atom_index * memoryAtomSize;
+        return memoryRange.addIntlvBits(trimmed_addr);
+    }
+
+  public:
+    PopCountDirectory(AddrRange mem_range, int atoms_per_block, int atom_size):
+        WorkDirectory(),
+        memoryRange(mem_range), numAtomsPerBlock(atoms_per_block),
+        memoryAtomSize(atom_size), _workCount(0),
+        prevIndex(-1), currentCounter(0)
+    {
+        blockSize = numAtomsPerBlock * memoryAtomSize;
+        numCounters = (int) (memoryRange.size() / blockSize);
+        lastCounterIndex = numCounters - 1;
+        popCount = new uint32_t [numCounters];
+        for (int index = 0; index < numCounters; index++) {
+            popCount[index] = 0;
+        }
+        activeBlockIndices = UniqueFIFO<int>(numCounters);
+    }
+
+    // CAUTION: This should only be called when the work
+    // directory **is not** tracking the atom with atom_addr
+    virtual int activate(Addr atom_addr)
+    {
+        int index = getIndexFromAtomAddr(atom_addr);
+        uint32_t prev_count = popCount[index];
+        popCount[index]++;
+        _workCount++;
+        activeBlockIndices.push_back(index);
+        assert(popCount[index] > prev_count);
+        assert(popCount[index] <= numAtomsPerBlock);
+        return popCount[index];
+    }
+
+    // CAUTION: This should only be called when the work
+    // directory **is** tracking the atom with atom_addr
+    virtual int deactivate(Addr atom_addr)
+    {
+        int index = getIndexFromAtomAddr(atom_addr);
+        uint32_t prev_count = popCount[index];
+        popCount[index]--;
+        _workCount--;
+        if (popCount[index] == 0) {
+            activeBlockIndices.erase(index);
+        }
+        assert(popCount[index] < prev_count);
+        assert(popCount[index] <= numAtomsPerBlock);
+        return popCount[index];
+    }
+
+    virtual int workCount() { return _workCount; }
+
+    void setLastAtomAddr(Addr atom_addr)
+    {
+        lastCounterIndex = getIndexFromAtomAddr(atom_addr);
+    }
+
+    // CAUTION: This directory only tracks active vertices in the memory
+    // and it does not have any information on the state of the cache and/or
+    // the active buffer or the write buffer. Therefore, it might generate a
+    // read request to an address that might be in any of those. In that case,
+    // the generated address should be ignored.
+    virtual Addr getNextWork()
+    {
+        // The directory should never be asked for work when it is empty.
+        assert(!activeBlockIndices.empty());
+        int front_index = activeBlockIndices.front();
+        assert(popCount[front_index] > 0);
+        if ((prevIndex != -1) && (prevIndex != front_index)) {
+            currentCounter = 0;
+        }
+        if (currentCounter == numAtomsPerBlock) {
+            currentCounter = 0;
+            activeBlockIndices.pop_front();
+            activeBlockIndices.push_back(front_index);
+        }
+        int current_index = activeBlockIndices.front();
+        Addr ret_addr = getAtomAddrFromIndex(current_index, currentCounter);
+        prevIndex = current_index;
+        currentCounter++;
+        return ret_addr;
+    }
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh
index 07bd255d26..3c5c150b29 100644
--- a/src/base/addr_range.hh
+++ b/src/base/addr_range.hh
@@ -48,6 +48,7 @@
 
 #include "base/bitfield.hh"
 #include "base/cprintf.hh"
+#include "base/intmath.hh"
 #include "base/logging.hh"
 #include "base/types.hh"
 
@@ -732,6 +733,40 @@ class AddrRange
     {
         return !(*this == r);
     }
+
+    friend AddrRange
+    mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit)
+    {
+        assert(left.interleaved());
+        assert(right.interleaved());
+        assert(left.mergesWith(right));
+
+        uint8_t old_left_match = left.intlvMatch;
+        uint8_t new_left_match = 0;
+        uint8_t old_right_match = right.intlvMatch;
+        uint8_t new_right_match = 0;
+        int new_bits = left.masks.size() - 1;
+
+        // Assumption: masks are sorted in ascending order.
+        std::vector<Addr> new_masks;
+        for (auto mask: left.masks) {
+            uint64_t lsb_mask = (mask ^ (mask - 1)) + 1;
+            if ((lsb_mask >> 1) != (1ULL << pch_bit)) {
+                new_masks.push_back(mask);
+                new_left_match |= ((old_left_match & 1) << new_bits);
+                new_left_match >>= 1;
+                new_right_match |= ((old_right_match & 1) << new_bits);
+                new_right_match >>= 1;
+            }
+            old_left_match >>= 1;
+            old_right_match >>= 1;
+        }
+        panic_if(new_left_match != new_right_match,
+                 "The two ranges cannot be a pseudo channel pair "
+                 "given the pseudochannel bit position of params.pch_bit.");
+
+        return AddrRange(left._start, left._end, new_masks, new_left_match);
+    }
 };
 
 static inline AddrRangeList
@@ -817,6 +852,16 @@ RangeSize(Addr start, Addr size)
 {
     return AddrRange(start, start + size);
 }
 
+inline bool
+contains(AddrRangeList range_list, Addr addr) +{ + bool ret = false; + for (auto range: range_list) { + ret |= range.contains(addr); + } + return ret; +} + } // namespace gem5 #endif // __BASE_ADDR_RANGE_HH__ diff --git a/src/base/statistics.hh b/src/base/statistics.hh index 24cbf714f5..15aeff892e 100644 --- a/src/base/statistics.hh +++ b/src/base/statistics.hh @@ -1052,7 +1052,7 @@ class VectorBase : public DataWrapVec Proxy operator[](off_type index) { - assert (index < size()); + // assert (index < size()); return Proxy(this->self(), index); } }; diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py index 0c7c1ea919..f7355d4b67 100644 --- a/src/mem/HBMCtrl.py +++ b/src/mem/HBMCtrl.py @@ -42,6 +42,8 @@ class HBMCtrl(MemCtrl): # HBMCtrl has been tested with two HBM_2000_4H_1x64 interfaces dram_2 = Param.DRAMInterface("DRAM memory interface") + pch_bit = Param.Int("Position of PseudoChannel bit in addresses.") + # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaaces # gives the best results with following min_r/w_per_switch min_reads_per_switch = 64 diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index 99618c4b5f..efd46bbd54 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,6 +45,7 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), + pchBit(p.pch_bit), retryRdReqPC1(false), retryWrReqPC1(false), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, @@ -233,7 +234,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) bool is_pc0; // TODO: make the interleaving bit across pseudo channels a parameter - if (bits(pkt->getAddr(), 6) == 0) { + if (bits(pkt->getAddr(), pchBit) == 0) { is_pc0 = true; } else { is_pc0 = false; @@ -492,8 +493,11 @@ AddrRangeList HBMCtrl::getAddrRanges() { AddrRangeList ranges; - ranges.push_back(pc0Int->getAddrRange()); - ranges.push_back(pc1Int->getAddrRange()); + AddrRange pc0Int_range = pc0Int->getAddrRange(); + AddrRange pc1Int_range = pc1Int->getAddrRange(); + ranges.push_back( + mergePseudoChannelRanges(pc0Int_range, pc1Int_range, pchBit) + ); return ranges; } diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index c9045f0ae7..f204b8346f 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -72,7 +72,8 @@ class HBMCtrl : public MemCtrl } private: - + // Position of the pseudochannel bit in addresses. + int pchBit; /** * Remember if we have to retry a request for second pseudo channel. 
*/ diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index c65d68a5a7..3cbacef800 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -212,7 +212,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 31dc330cab..daf9d18e88 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -237,6 +237,7 @@ MemCmd::commandInfo[] = { {IsRead, IsResponse}, InvalidCmd, "HTMReqResp" }, { {IsRead, IsRequest}, InvalidCmd, "HTMAbort" }, { {IsRequest}, InvalidCmd, "TlbiExtSync" }, + { {IsRequest, HasData}, InvalidCmd, "UpdateWL"} }; AddrRange diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbec00..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -148,6 +148,8 @@ class MemCmd HTMAbort, // Tlb shootdown TlbiExtSync, + // MPU Accelerator + UpdateWL, NUM_MEM_CMDS }; diff --git a/src/mem/port_proxy.cc b/src/mem/port_proxy.cc index 19e1a53e84..55145ab7d7 100644 --- a/src/mem/port_proxy.cc +++ b/src/mem/port_proxy.cc @@ -56,7 +56,7 @@ PortProxy::PortProxy(const RequestPort &port, unsigned int cache_line_size) : void PortProxy::readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const + void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -73,7 +73,7 @@ PortProxy::readBlobPhys(Addr addr, Request::Flags flags, void PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const + const void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -90,7 +90,7 @@ PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, void PortProxy::memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const + uint8_t v, Addr size) const { // quick and dirty... uint8_t *buf = new uint8_t[size]; diff --git a/src/mem/port_proxy.hh b/src/mem/port_proxy.hh index 29f6ba60a4..8cd21322ea 100644 --- a/src/mem/port_proxy.hh +++ b/src/mem/port_proxy.hh @@ -120,19 +120,19 @@ class PortProxy : FunctionalRequestProtocol * Read size bytes memory at physical address and store in p. */ void readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const; + void *p, Addr size) const; /** * Write size bytes from p to physical address. */ void writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const; + const void *p, Addr size) const; /** * Fill size bytes starting at physical addr with byte value val. */ void memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const; + uint8_t v, Addr size) const; @@ -143,7 +143,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryReadBlob(Addr addr, void *p, int size) const + tryReadBlob(Addr addr, void *p, Addr size) const { readBlobPhys(addr, 0, p, size); return true; @@ -154,7 +154,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. 
*/ virtual bool - tryWriteBlob(Addr addr, const void *p, int size) const + tryWriteBlob(Addr addr, const void *p, Addr size) const { writeBlobPhys(addr, 0, p, size); return true; @@ -165,7 +165,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryMemsetBlob(Addr addr, uint8_t val, int size) const + tryMemsetBlob(Addr addr, uint8_t val, Addr size) const { memsetBlobPhys(addr, 0, val, size); return true; @@ -179,7 +179,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryReadBlob, but insists on success. */ void - readBlob(Addr addr, void *p, int size) const + readBlob(Addr addr, void *p, Addr size) const { if (!tryReadBlob(addr, p, size)) fatal("readBlob(%#x, ...) failed", addr); @@ -189,7 +189,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryWriteBlob, but insists on success. */ void - writeBlob(Addr addr, const void *p, int size) const + writeBlob(Addr addr, const void *p, Addr size) const { if (!tryWriteBlob(addr, p, size)) fatal("writeBlob(%#x, ...) failed", addr); @@ -199,7 +199,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryMemsetBlob, but insists on success. */ void - memsetBlob(Addr addr, uint8_t v, int size) const + memsetBlob(Addr addr, uint8_t v, Addr size) const { if (!tryMemsetBlob(addr, v, size)) fatal("memsetBlob(%#x, ...) failed", addr); diff --git a/src/mem/translating_port_proxy.cc b/src/mem/translating_port_proxy.cc index 8ab859f40d..bc698c1a07 100644 --- a/src/mem/translating_port_proxy.cc +++ b/src/mem/translating_port_proxy.cc @@ -86,7 +86,7 @@ TranslatingPortProxy::tryOnBlob(BaseMMU::Mode mode, TranslationGenPtr gen, } bool -TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const +TranslatingPortProxy::tryReadBlob(Addr addr, void *p, Addr size) const { constexpr auto mode = BaseMMU::Read; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -99,7 +99,7 @@ TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const bool TranslatingPortProxy::tryWriteBlob( - Addr addr, const void *p, int size) const + Addr addr, const void *p, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -111,7 +111,7 @@ TranslatingPortProxy::tryWriteBlob( } bool -TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, int size) const +TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( diff --git a/src/mem/translating_port_proxy.hh b/src/mem/translating_port_proxy.hh index bedb57a3ce..7e619784b1 100644 --- a/src/mem/translating_port_proxy.hh +++ b/src/mem/translating_port_proxy.hh @@ -77,16 +77,16 @@ class TranslatingPortProxy : public PortProxy /** Version of tryReadblob that translates virt->phys and deals * with page boundries. */ - bool tryReadBlob(Addr addr, void *p, int size) const override; + bool tryReadBlob(Addr addr, void *p, Addr size) const override; /** Version of tryWriteBlob that translates virt->phys and deals * with page boundries. */ - bool tryWriteBlob(Addr addr, const void *p, int size) const override; + bool tryWriteBlob(Addr addr, const void *p, Addr size) const override; /** * Fill size bytes starting at addr with byte value val. */ - bool tryMemsetBlob(Addr address, uint8_t v, int size) const override; + bool tryMemsetBlob(Addr address, uint8_t v, Addr size) const override; }; } // namespace gem5
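
Note on PopCountDirectory (src/accl/graph/sega/work_directory.hh): the round-robin scan in getNextWork() is easier to see in isolation. The sketch below is a minimal, self-contained model of that policy, not part of the patch, and it makes simplifying assumptions: a flat address space with no interleaving (the real class strips and re-adds interleave bits through AddrRange), a plain std::deque standing in for gem5's UniqueFIFO<int>, and a hypothetical name (SimplePopCountDirectory). The design point it illustrates: one pop counter per block of atoms gives O(1) "is there work / where is it" lookups without a per-vertex bitmap, at the cost of possibly generating addresses whose atom is actually held in a cache or buffer (hence the CAUTION comment in the header).

// Build with: g++ -std=c++17 popcount_sketch.cc
#include <cassert>
#include <cstdint>
#include <deque>
#include <iostream>
#include <vector>

class SimplePopCountDirectory
{
    int numAtomsPerBlock;
    int atomSize;
    int blockSize;
    std::vector<uint32_t> popCount; // active atoms per block
    std::deque<int> activeBlocks;   // FIFO of blocks with pending work
    int prevIndex = -1;
    uint32_t currentCounter = 0;

  public:
    SimplePopCountDirectory(uint64_t mem_size, int atoms_per_block,
                            int atom_size)
      : numAtomsPerBlock(atoms_per_block), atomSize(atom_size),
        blockSize(atoms_per_block * atom_size),
        popCount(mem_size / blockSize, 0)
    {}

    void activate(uint64_t atom_addr)
    {
        int index = atom_addr / blockSize;
        // Enqueue a block only on its 0 -> 1 transition; this mimics the
        // uniqueness guarantee UniqueFIFO provides in the real class.
        if (popCount[index]++ == 0)
            activeBlocks.push_back(index);
    }

    // Round-robin scan: hand out consecutive atoms of the front block,
    // then rotate the block to the back, as in getNextWork() above.
    uint64_t getNextWork()
    {
        assert(!activeBlocks.empty());
        int front_index = activeBlocks.front();
        if (prevIndex != -1 && prevIndex != front_index)
            currentCounter = 0;
        if (currentCounter == (uint32_t)numAtomsPerBlock) {
            currentCounter = 0;
            activeBlocks.pop_front();
            activeBlocks.push_back(front_index);
        }
        int current = activeBlocks.front();
        uint64_t addr = (uint64_t)current * blockSize
                      + (uint64_t)currentCounter * atomSize;
        prevIndex = current;
        currentCounter++;
        return addr;
    }
};

int main()
{
    // 1 KiB of "memory": 4 atoms of 64 B per 256 B block.
    SimplePopCountDirectory dir(1024, 4, 64);
    dir.activate(0);   // block 0
    dir.activate(512); // block 2
    for (int i = 0; i < 6; i++)
        std::cout << dir.getNextWork() << "\n"; // 0 64 128 192 512 576
    return 0;
}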
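
Similarly, mergePseudoChannelRanges() in src/base/addr_range.hh can be read as "delete the interleaving level that selects the pseudo channel". The sketch below, again illustrative only and using a hypothetical helper name (dropPchMask), models just the mask filtering on plain integers; the real function additionally re-packs the intlvMatch bits of both ranges and panics if they disagree. It assumes, like the patch, that a mask belonging to the pseudo-channel level is identified by its lowest set bit being the pch_bit.

// Build with: g++ -std=c++17 pch_merge_sketch.cc
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Return the interleave masks with the pseudo-channel mask removed.
std::vector<uint64_t>
dropPchMask(const std::vector<uint64_t> &masks, int pch_bit)
{
    std::vector<uint64_t> merged;
    for (uint64_t mask : masks) {
        uint64_t lsb = mask & ~(mask - 1);     // lowest set bit of the mask
        if (lsb != (1ULL << pch_bit))
            merged.push_back(mask);            // keep non-pseudo-channel masks
    }
    assert(merged.size() == masks.size() - 1); // exactly one mask dropped
    return merged;
}

int main()
{
    // Two interleave masks: bit 6 selects the pseudo channel (the HBMCtrl
    // default) and bit 8 is some coarser channel interleaving. Merging the
    // two pseudo-channel ranges removes only the bit-6 mask.
    std::vector<uint64_t> masks = {1ULL << 6, 1ULL << 8};
    for (uint64_t m : dropPchMask(masks, 6))
        std::cout << std::hex << m << "\n";    // prints: 100
    return 0;
}

This is also why HBMCtrl::getAddrRanges() in the patch advertises one merged range instead of two: to the rest of the system the controller now looks like a single contiguous target, and the pch_bit parameter alone decides which pseudo channel a request lands in.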